READ THE DATA

library(dplyr)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(data.table)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
data.table 1.12.8 using 4 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last
library(mltools)
chicago_crime <- read.table(file = "chicago_crime_clean.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

chicago_crime$location_description <- (gsub(","," ",chicago_crime$location_description))
chicago_crime$description <- gsub(":=","",chicago_crime$description)
chicago_crime$description <- gsub(":","",chicago_crime$description)
chicago_crime$description <- gsub("MANU/POSS. W/","",chicago_crime$description)
chicago_crime$description <- gsub(",","",chicago_crime$description)
#chicago_crime$description <- gsub(".","",chicago_crime$description)
chicago_crime$location_description <- gsub("(E.G.  UBER  LYFT)","",chicago_crime$location_description)
chicago_crime$location_description <- gsub(",","",chicago_crime$location_description)
#chicago_crime$location_description <- gsub(".","",chicago_crime$location_description)

chicago_crime <- chicago_crime %>%
    dplyr::mutate(year = lubridate::year(date), 
                month = lubridate::month(date), 
                day = lubridate::day(date))
chicago_crime <- na.omit(chicago_crime)
chicago_crime <- select(chicago_crime,-c(X,unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date,year))
chicago_crime$district <- factor(chicago_crime$district)

unique(chicago_crime$primary_type)
 [1] "ASSAULT"                           "OTHER OFFENSE"                     "NARCOTICS"                        
 [4] "DECEPTIVE PRACTICE"                "CRIMINAL TRESPASS"                 "WEAPONS VIOLATION"                
 [7] "CRIM SEXUAL ASSAULT"               "BURGLARY"                          "MOTOR VEHICLE THEFT"              
[10] "KIDNAPPING"                        "PUBLIC PEACE VIOLATION"            "INTERFERENCE WITH PUBLIC OFFICER" 
[13] "BATTERY"                           "GAMBLING"                          "ROBBERY"                          
[16] "OFFENSE INVOLVING CHILDREN"        "SEX OFFENSE"                       "THEFT"                            
[19] "CONCEALED CARRY LICENSE VIOLATION" "CRIMINAL DAMAGE"                   "ARSON"                            
[22] "HOMICIDE"                          "LIQUOR LAW VIOLATION"              "STALKING"                         
[25] "INTIMIDATION"                      "PROSTITUTION"                      "HUMAN TRAFFICKING"                
[28] "OBSCENITY"                         "OTHER NARCOTIC VIOLATION"          "PUBLIC INDECENCY"                 
[31] "NON-CRIMINAL"                     
head(chicago_crime)
summary(chicago_crime)
 case_number           block           primary_type       description        location_description    arrest             district     
 Length:213687      Length:213687      Length:213687      Length:213687      Length:213687        Length:213687      11     : 14869  
 Class :character   Class :character   Class :character   Class :character   Class :character     Class :character   18     : 14243  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character     Mode  :character   1      : 14227  
                                                                                                                     6      : 12717  
                                                                                                                     8      : 12577  
                                                                                                                     12     : 11592  
                                                                                                                     (Other):133462  
      ward          latitude       longitude          month             day      
 Min.   : 1.00   Min.   :36.62   Min.   :-91.69   Min.   : 1.000   Min.   : 1.0  
 1st Qu.:10.00   1st Qu.:41.77   1st Qu.:-87.71   1st Qu.: 4.000   1st Qu.: 8.0  
 Median :24.00   Median :41.87   Median :-87.66   Median : 7.000   Median :15.0  
 Mean   :23.83   Mean   :41.85   Mean   :-87.67   Mean   : 6.575   Mean   :15.5  
 3rd Qu.:36.00   3rd Qu.:41.91   3rd Qu.:-87.63   3rd Qu.: 9.000   3rd Qu.:23.0  
 Max.   :50.00   Max.   :42.02   Max.   :-87.52   Max.   :12.000   Max.   :31.0  
                                                                                 

Read the training set

library(dplyr)
library(data.table)
library(mltools)
chicago_crime_tr <- read.table(file = "chicago_crime_tr.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

chicago_crime_tr$location_description <- (gsub(","," ",chicago_crime_tr$location_description))
chicago_crime_tr$description <- gsub(":=","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub(":","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub("MANU/POSS. W/","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub(",","",chicago_crime_tr$description)
#chicago_crime$description <- gsub(".","",chicago_crime$description)
chicago_crime_tr$location_description <- gsub("(E.G.  UBER  LYFT)","",chicago_crime_tr$location_description)
chicago_crime_tr$location_description <- gsub(",","",chicago_crime_tr$location_description)
#chicago_crime$location_description <- gsub(".","",chicago_crime$location_description)

chicago_crime_tr <- chicago_crime_tr %>%
    dplyr::mutate(year = lubridate::year(date), 
                month = lubridate::month(date), 
                day = lubridate::day(date))

chicago_crime_tr <- select(chicago_crime_tr,-c(X,unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date,year))

chicago_crime_tr <- na.omit(chicago_crime_tr)
chicago_crime_tr$district <- factor(chicago_crime_tr$district)

unique(chicago_crime_tr$primary_type)
 [1] "INTERFERENCE WITH PUBLIC OFFICER"  "OTHER OFFENSE"                     "DECEPTIVE PRACTICE"               
 [4] "SEX OFFENSE"                       "CRIM SEXUAL ASSAULT"               "BATTERY"                          
 [7] "CRIMINAL TRESPASS"                 "MOTOR VEHICLE THEFT"               "THEFT"                            
[10] "ASSAULT"                           "NARCOTICS"                         "ROBBERY"                          
[13] "PUBLIC PEACE VIOLATION"            "WEAPONS VIOLATION"                 "STALKING"                         
[16] "OFFENSE INVOLVING CHILDREN"        "ARSON"                             "BURGLARY"                         
[19] "CRIMINAL DAMAGE"                   "HOMICIDE"                          "INTIMIDATION"                     
[22] "KIDNAPPING"                        "HUMAN TRAFFICKING"                 "PROSTITUTION"                     
[25] "OBSCENITY"                         "CONCEALED CARRY LICENSE VIOLATION" "CRIMINAL SEXUAL ASSAULT"          
[28] "NON-CRIMINAL"                      "PUBLIC INDECENCY"                  "LIQUOR LAW VIOLATION"             
[31] "GAMBLING"                          "OTHER NARCOTIC VIOLATION"          "NON - CRIMINAL"                   
[34] "NON-CRIMINAL (SUBJECT SPECIFIED)" 
head(chicago_crime_tr)
summary(chicago_crime_tr)
 case_number           block           primary_type       description        location_description    arrest             district     
 Length:666666      Length:666666      Length:666666      Length:666666      Length:666666        Length:666666      11     : 45269  
 Class :character   Class :character   Class :character   Class :character   Class :character     Class :character   1      : 42067  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character     Mode  :character   18     : 41971  
                                                                                                                     8      : 41215  
                                                                                                                     6      : 37548  
                                                                                                                     12     : 35981  
                                                                                                                     (Other):422615  
      ward          latitude       longitude          month             day       
 Min.   : 1.00   Min.   :41.64   Min.   :-87.93   Min.   : 1.000   Min.   : 1.00  
 1st Qu.:11.00   1st Qu.:41.77   1st Qu.:-87.71   1st Qu.: 4.000   1st Qu.: 8.00  
 Median :24.00   Median :41.87   Median :-87.66   Median : 7.000   Median :16.00  
 Mean   :23.76   Mean   :41.85   Mean   :-87.67   Mean   : 6.594   Mean   :15.72  
 3rd Qu.:36.00   3rd Qu.:41.91   3rd Qu.:-87.63   3rd Qu.: 9.000   3rd Qu.:23.00  
 Max.   :50.00   Max.   :42.02   Max.   :-87.52   Max.   :12.000   Max.   :31.00  
                                                                                  
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "CRIMINAL TRESPASS"] <- "ROBBERY"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "BURGLARY"] <- "ROBBERY"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "MOTOR VEHICLE THEFT"] <- "THEFT"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "HOMICIDE"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "KIDNAPPING"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "BATTERY"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "INTIMIDATION"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "ARSON"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "PROSTITUTION"] <- "SEX OFFENSE"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "CRIM SEXUAL ASSAULT"] <- "SEX OFFENSE"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "OTHER NARCOTIC VIOLATION"] <- "NARCOTICS"
chicago_crime_tr$primary_type <- factor(chicago_crime_tr$primary_type)

chicago_crime_subset_tr <- subset(chicago_crime_tr, primary_type=="ASSAULT" | primary_type == "VIOLENT CRIME" | primary_type == "THEFT" | primary_type=="NARCOTICS" | primary_type == "WEAPONS VIOLATION" | primary_type=="ROBBERY" | primary_type == "CRIMINAL DAMAGE" | primary_type == "DECEPTIVE PRACTICE" )

chicago_crime_subset_tr$primary_type <- factor(chicago_crime_subset_tr$primary_type)
chicago_crime_subset_tr <- na.omit(chicago_crime_subset_tr)
library(DataExplorer)
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
plot_str(chicago_crime_subset_tr)

plot_missing(chicago_crime_subset_tr)

#plot_histogram(chicago_crime_subset)
#plot_density(chicago_crime_subset)
#plot_correlation(chicago_numeric, type = 'continuous')
chicago_crime_subset_tr$month <- as.factor(chicago_crime_subset_tr$month)

plot_bar(chicago_crime_subset_tr)
4 columns ignored with more than 50 categories.
case_number: 610345 categories
block: 31746 categories
description: 210 categories
location_description: 162 categories

EXPLORATORY ANALYSIS

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.2.1     ✓ purrr   0.3.3
✓ tibble  2.1.3     ✓ stringr 1.4.0
✓ tidyr   1.0.2     ✓ forcats 0.4.0
✓ readr   1.3.1     
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x data.table::between() masks dplyr::between()
x dplyr::filter()       masks stats::filter()
x data.table::first()   masks dplyr::first()
x dplyr::lag()          masks stats::lag()
x data.table::last()    masks dplyr::last()
x tidyr::replace_na()   masks mltools::replace_na()
x purrr::transpose()    masks data.table::transpose()
ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


chicago_crime %>% 
  count(primary_type)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1))


chicago_crime %>% 
  count(district)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = arrest)) +
  theme(axis.text.x = element_text(hjust = 1))


chicago_crime %>% 
  count(arrest)

#chicago_crime$primary_type <- as.character(junk$nm)
chicago_crime$primary_type[chicago_crime$primary_type == "CRIMINAL TRESPASS"] <- "ROBBERY"
chicago_crime$primary_type[chicago_crime$primary_type == "BURGLARY"] <- "ROBBERY"
chicago_crime$primary_type[chicago_crime$primary_type == "MOTOR VEHICLE THEFT"] <- "THEFT"
chicago_crime$primary_type[chicago_crime$primary_type == "HOMICIDE"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "KIDNAPPING"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "BATTERY"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "INTIMIDATION"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "ARSON"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "PROSTITUTION"] <- "SEX OFFENSE"
chicago_crime$primary_type[chicago_crime$primary_type == "CRIM SEXUAL ASSAULT"] <- "SEX OFFENSE"
chicago_crime$primary_type[chicago_crime$primary_type == "OTHER NARCOTIC VIOLATION"] <- "NARCOTICS"
chicago_crime$primary_type <- factor(chicago_crime$primary_type)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


chicago_crime %>% 
  count(primary_type)

chicago_crime_subset <- subset(chicago_crime, primary_type=="ASSAULT" | primary_type == "VIOLENT CRIME" | primary_type == "THEFT" | primary_type=="NARCOTICS" | primary_type == "WEAPONS VIOLATION" | primary_type=="ROBBERY" | primary_type == "CRIMINAL DAMAGE" | primary_type == "DECEPTIVE PRACTICE" )
chicago_crime_subset$primary_type <- factor(chicago_crime_subset$primary_type)
ggplot(data = chicago_crime_subset) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = primary_type, y = district)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = arrest, y = primary_type)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = arrest, y = district)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

EXPLORATORY ANALYSIS BY CRIME

assault <- subset(chicago_crime_subset, primary_type=="ASSAULT")
violent_crime <- subset(chicago_crime_subset, primary_type=="VIOLENT CRIME")
theft <- subset(chicago_crime_subset, primary_type=="THEFT")
narcotics <- subset(chicago_crime_subset, primary_type=="NARCOTICS")
weapons_violation <- subset(chicago_crime_subset, primary_type=="WEAPONS VIOLATION")
robbery <- subset(chicago_crime_subset, primary_type=="ROBBERY")
criminal_damage <- subset(chicago_crime_subset, primary_type=="CRIMINAL DAMAGE")
deceptive_practice <- subset(chicago_crime_subset, primary_type=="DECEPTIVE PRACTICE")

assault_tr <- subset(chicago_crime_subset_tr, primary_type=="ASSAULT")
violent_tr_crime <- subset(chicago_crime_subset_tr, primary_type=="VIOLENT CRIME")
theft_tr <- subset(chicago_crime_subset_tr, primary_type=="THEFT")
narcotics_tr <- subset(chicago_crime_subset_tr, primary_type=="NARCOTICS")
weapons_violation_tr <- subset(chicago_crime_subset_tr, primary_type=="WEAPONS VIOLATION")
robbery_tr <- subset(chicago_crime_subset_tr, primary_type=="ROBBERY")
criminal_damage_tr <- subset(chicago_crime_subset_tr, primary_type=="CRIMINAL DAMAGE")
deceptive_practice_tr <- subset(chicago_crime_subset_tr, primary_type=="DECEPTIVE PRACTICE")

DISTRICTS

library(sqldf)
Loading required package: gsubfn
Loading required package: proto
unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
  dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
  Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
  Reason: image not foundCould not load tcltk.  Will use slower R code instead.
Loading required package: RSQLite
districts_true <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as arrest FROM chicago_crime_subset WHERE arrest LIKE "True" GROUP BY district ORDER BY district')
districts_false <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as no_arrest FROM chicago_crime_subset WHERE arrest LIKE "False" GROUP BY district ORDER BY district')
districts_true$arrest <- as.numeric(districts_true$arrest)
districts_false$no_arrest <- as.numeric(districts_false$no_arrest)
districts_true
districts_false

police_districts <- read.table(file = "Police_Stations.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)
police_districts

police_districts$DISTRICT[police_districts$DISTRICT == "Headquarters"] <- "0"
police_districts$DISTRICT <- as.factor(police_districts$DISTRICT)

districts <- sqldf('SELECT DISTRICT as district, LATITUDE as latitude,LONGITUDE as longitude FROM police_districts')

arrest_percentage <- data.frame('District' = districts_false$district, 'PctArrest' = districts_true$arrest/(districts_true$arrest + districts_false$no_arrest), 'Crimes' = (districts_true$arrest + districts_false$no_arrest))
arrest_percentage

ggplot(data = arrest_percentage) +
  geom_col(mapping = aes(x = District, y = Crimes)) +
  geom_line(aes(x = District, y = PctArrest*10000, group = 1), color = "yellow") +
  scale_y_continuous(sec.axis = sec_axis(~./10000, name = "PctArrest")) +
  theme(axis.text.x = element_text(hjust = 1))


## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
rgdal: version: 1.4-8, (SVN revision 845)
 Geospatial Data Abstraction Library extensions to R successfully loaded
 Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
 Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
 GDAL binary built with GEOS: FALSE 
 Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
 Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
 Linking to sp version: 1.3-2 
# library("maptools")
library("KernSmooth")
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
setDT(districts_false)

#devtools::install_github("dkahle/ggmap", ref = "tidyup", force = TRUE)
library(ggmap)
Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
Please cite ggmap if you use it! See citation("ggmap") for details.
chicago <- get_stamenmap(bbox = c(left = -88.0225, bottom = 41.5949, 
                                  right = -87.2713, top = 42.0677), 
                         zoom = 11)
Source : http://tile.stamen.com/terrain/11/523/759.png
Source : http://tile.stamen.com/terrain/11/524/759.png
Source : http://tile.stamen.com/terrain/11/525/759.png
Source : http://tile.stamen.com/terrain/11/526/759.png
Source : http://tile.stamen.com/terrain/11/527/759.png
Source : http://tile.stamen.com/terrain/11/523/760.png
Source : http://tile.stamen.com/terrain/11/524/760.png
Source : http://tile.stamen.com/terrain/11/525/760.png
Source : http://tile.stamen.com/terrain/11/526/760.png
Source : http://tile.stamen.com/terrain/11/527/760.png
Source : http://tile.stamen.com/terrain/11/523/761.png
Source : http://tile.stamen.com/terrain/11/524/761.png
Source : http://tile.stamen.com/terrain/11/525/761.png
Source : http://tile.stamen.com/terrain/11/526/761.png
Source : http://tile.stamen.com/terrain/11/527/761.png
Source : http://tile.stamen.com/terrain/11/523/762.png
Source : http://tile.stamen.com/terrain/11/524/762.png
Source : http://tile.stamen.com/terrain/11/525/762.png
Source : http://tile.stamen.com/terrain/11/526/762.png
Source : http://tile.stamen.com/terrain/11/527/762.png
Source : http://tile.stamen.com/terrain/11/523/763.png
Source : http://tile.stamen.com/terrain/11/524/763.png
Source : http://tile.stamen.com/terrain/11/525/763.png
Source : http://tile.stamen.com/terrain/11/526/763.png
Source : http://tile.stamen.com/terrain/11/527/763.png
ggmap(chicago) +
geom_text(aes(x = longitude, y = latitude, label = district), data = districts)


library(ggmap)
chicago <- get_stamenmap(bbox = c(left = -88.0225, bottom = 41.5949, 
                                  right = -87.2713, top = 42.0677), 
                         zoom = 11)
ggmap(chicago) +
geom_text(aes(x = LONGITUDE, y = LATITUDE, label = DISTRICT), data = police_districts)

ggplot(data = assault) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("ASSAULT BY DISTRICT")


ggplot(data = theft) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("THEFTS BY DISTRICT")


ggplot(data = violent_crime) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("VIOLENT CRIMES BY DISTRICT")


ggplot(data = narcotics) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("NARCOTIC CRIMES BY DISTRICT")


ggplot(data = weapons_violation) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("WEAPON-RELATED CRIMES BY DISTRICT")


ggplot(data = robbery) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("ROBBERIES BY DISTRICT")


ggplot(data = criminal_damage) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("CRIMINAL DAMAGE CRIMES BY DISTRICT")


ggplot(data = deceptive_practice) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("DECEPTIVE PRACTICE CRIMES BY DISTRICT")

library(ggplot2)

ggplot(data = chicago_crime_subset, aes(x=primary_type, y=district, fill=arrest)) + 
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Correlation
library(ggplot2)
ggplot(chicago_crime_subset,aes(x=district,y=primary_type,color=arrest))+geom_point(alpha=0.5)

Association Rules

chicago_crime_subset_2 <- subset(chicago_crime_subset, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
chicago_crime_subset_2 <- subset(chicago_crime_subset_2, select=-c(location_description))
write.csv(chicago_crime_subset_2,"chicago_crime_AR.csv", quote = FALSE, row.names = FALSE)
library(arules)
Loading required package: Matrix

Attaching package: ‘Matrix’

The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Attaching package: ‘arules’

The following object is masked from ‘package:dplyr’:

    recode

The following objects are masked from ‘package:base’:

    abbreviate, write
crime_transactions <- read.transactions("chicago_crime_AR.csv", sep=",")

#deceptive_practice_2 <- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#write.csv(deceptive_practice_2,"deceptive_practice.csv", quote = FALSE, row.names = FALSE)
#dp_transactions <- read.transactions("deceptive_practice.csv", sep=",")
if (!require("RColorBrewer")) {
  # install color package of R
  install.packages("RColorBrewer")
  #include library RColorBrewer
  library(RColorBrewer)
}
Loading required package: RColorBrewer
itemFrequencyPlot(crime_transactions,topN=20,type="absolute",
                  col=brewer.pal(8,'Pastel2'), 
                  main="Absolute Item Frequency Plot")

Reglas de Asociacion General

# Rule GENERATION
association.rules.clean <- apriori(crime_transactions, parameter = list(supp=0.001, conf=0.7))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [179 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
subset.rules.clean <- which(colSums(is.subset(association.rules.clean, association.rules.clean)) > 1)
subset.association.rules.clean. <- association.rules.clean[-subset.rules.clean]
inspect(subset.association.rules.clean.)

rules_by_count <- sort(association.rules.clean, by = "count")
rules_by_conf <- sort(association.rules.clean, by = "confidence")
rules_by_supp <- sort(association.rules.clean, by = "lift")
inspect(rules_by_count)
inspect(rules_by_conf)
inspect(rules_by_supp)
# Rule GENERATION
assault.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                   appearance = list(default="lhs",rhs="ASSAULT"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [7 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
assault.subset.rules <- which(colSums(is.subset(assault.association.rules, assault.association.rules)) > 1) # get subset rules in vector
assault.subset.association.rules. <- assault.association.rules[-assault.subset.rules] # remove subset rules.
inspect(assault.subset.association.rules.)

as_by_count <- sort(assault.association.rules, by = "count")
as_by_conf <- sort(assault.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(as_by_count)
inspect(as_by_conf)
#inspect(dp_by_supp)
# Rule GENERATION
cd.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                     appearance = list(default="lhs",rhs="CRIMINAL DAMAGE"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [40 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
cd.subset.rules <- which(colSums(is.subset(cd.association.rules, cd.association.rules)) > 1) # get subset rules in vector
cd.subset.association.rules. <- cd.association.rules[-cd.subset.rules] # remove subset rules.
inspect(cd.association.rules)

cd_by_count <- sort(cd.association.rules, by = "count")
cd_by_conf <- sort(cd.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(cd_by_count)
inspect(cd_by_conf)
#inspect(dp_by_supp)
# Rule GENERATION
dp.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                     appearance = list(default="lhs",rhs="DECEPTIVE PRACTICE"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [19 rule(s)] done [0.00s].
creating S4 object  ... done [0.05s].
# Borrar reglas redundantes
dp.subset.rules <- which(colSums(is.subset(dp.association.rules, dp.association.rules)) > 1) # get subset rules in vector
dp.subset.association.rules. <- dp.association.rules[-dp.subset.rules] # remove subset rules.
inspect(dp.subset.association.rules.)

dp_by_count <- sort(dp.subset.association.rules., by = "count")
dp_by_conf <- sort(dp.subset.association.rules., by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(dp_by_count)
inspect(dp_by_conf)
#inspect(dp_by_supp)
narcotics_clean.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                   appearance = list(default="lhs",rhs="NARCOTICS"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [18 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
narcotics_clean.subset.rules <- which(colSums(is.subset(narcotics_clean.association.rules, narcotics_clean.association.rules)) > 1) # get subset rules in vector
narcotics_clean.subset.association.rules. <- narcotics_clean.association.rules[-narcotics_clean.subset.rules] # remove subset rules.
inspect(narcotics_clean.subset.association.rules.)

narc_by_count <- sort(narcotics_clean.association.rules, by = "count")
narc_by_conf <- sort(narcotics_clean.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(narc_by_count)
inspect(narc_by_conf)
#inspect(dp_by_supp)
robbery.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.15),
                                   appearance = list(default="lhs",rhs="ROBBERY"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [11 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
robbery.subset.rules <- which(colSums(is.subset(robbery.association.rules, robbery.association.rules)) > 1) 
robbery.subset.association.rules. <- robbery.association.rules[-robbery.subset.rules] # remove subset rules.
inspect(robbery.association.rules)

rob_by_count <- sort(robbery.association.rules, by = "count")
rob_by_conf <- sort(robbery.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(rob_by_count)
inspect(rob_by_conf)
#inspect(dp_by_supp)
theft.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.005, conf=0.5),
                                   appearance = list(default="lhs",rhs="THEFT"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 972 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [4 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
theft.subset.rules <- which(colSums(is.subset(theft.association.rules, theft.association.rules)) > 1) 
theft.subset.association.rules. <- theft.association.rules[-theft.subset.rules] # remove subset rules.
inspect(theft.subset.association.rules.)

theft_by_count <- sort(theft.association.rules, by = "count")
theft_by_conf <- sort(theft.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(theft_by_count)
inspect(theft_by_conf)
#inspect(dp_by_supp)
vc.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.15),
                                   appearance = list(default="lhs",rhs="VIOLENT CRIME"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [21 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
vc.subset.rules <- which(colSums(is.subset(vc.association.rules, vc.association.rules)) > 1) # get subset rules in  
vc.subset.association.rules. <- vc.association.rules[-vc.subset.rules] # remove subset rules.
inspect(vc.subset.association.rules.)

vc_by_count <- sort(vc.association.rules, by = "count")
vc_by_conf <- sort(vc.association.rules, by = "confidence")
#vc_by_supp <- sort(vc.subset.association.rules., by = "support")
inspect(vc_by_count)
inspect(vc_by_conf)
#inspect(wv_by_supp)
wv.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.1),
                                      appearance = list(default="lhs",rhs="WEAPONS VIOLATION"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [8 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
wv.subset.rules <- which(colSums(is.subset(wv.association.rules, wv.association.rules)) > 1) # get subset rules in  
wv.subset.association.rules. <- wv.association.rules[-wv.subset.rules] # remove subset rules.
inspect(wv.subset.association.rules.)

wv_by_count <- sort(wv.association.rules, by = "count")
wv_by_conf <- sort(wv.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(wv_by_count)
inspect(wv_by_conf)
#inspect(wv_by_supp)
true.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.5),
                                      appearance = list(default="lhs",rhs="True"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [24 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
true.subset.rules <- which(colSums(is.subset(true.association.rules, true.association.rules)) > 1) # get subset rules in  
true.subset.association.rules. <- true.association.rules[-true.subset.rules] # remove subset rules.
inspect(true.subset.association.rules.)

t_by_count <- sort(true.subset.association.rules., by = "count")
t_by_conf <- sort(true.subset.association.rules., by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(t_by_count)
inspect(t_by_conf)
#inspect(wv_by_supp)
false.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.8),
                                      appearance = list(default="lhs",rhs="False"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 194 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.04s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [125 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
false.subset.rules <- which(colSums(is.subset(false.association.rules, false.association.rules)) > 1) # get subset rules in  
false.subset.association.rules. <- false.association.rules[-false.subset.rules] # remove subset rules.
inspect(false.subset.association.rules.)

f_by_count <- sort(false.association.rules, by = "count")
f_by_conf <- sort(false.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(f_by_count)
inspect(f_by_conf)
#inspect(wv_by_supp)
ocho.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.0001, conf=0.01),
                                      appearance = list(default="lhs",rhs="8"))
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 19 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[36 item(s), 194542 transaction(s)] done [0.03s].
sorting and recoding items ... [32 item(s)] done [0.00s].
creating transaction tree ... done [0.04s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [26 rule(s)] done [0.00s].
creating S4 object  ... done [0.02s].
# Borrar reglas redundantes
ocho.subset.rules <- which(colSums(is.subset(ocho.association.rules, ocho.association.rules)) > 1) # get subset rules in  
ocho.subset.association.rules. <- ocho.association.rules[-ocho.subset.rules] # remove subset rules.
inspect(ocho.subset.association.rules.)

ocho_by_count <- sort(ocho.association.rules, by = "count")
ocho_by_conf <- sort(ocho.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(ocho_by_count)
inspect(ocho_by_conf)
#inspect(wv_by_supp)
## GRAFICOS 
## Dataset Entero
library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'seriation':
  method         from 
  reorder.hclust gclus
# Filter rules with confidence greater than 0.4 or 40%
subRules<-association.rules.clean[quality(association.rules.clean)$confidence>0.7]
#Plot SubRules
plot(subRules,method="two-key plot")


## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(subRules, n = 25, by = "confidence")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(top10subRules, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules, n=25, by="confidence")
plot(subRules2, method="paracoord")

#Plot SubRules
plot(assault.subset.association.rules.,method="two-key plot")


## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(assault.subset.association.rules., n = 20, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(assault.subset.association.rules., method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(assault.subset.association.rules., n=20, by="confidence")
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(cd.association.rules,method="two-key plot")


subRules_cd<-cd.association.rules[quality(cd.association.rules)$confidence>0.2]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(cd.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_cd, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_cd, n=25, by="confidence")
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(dp.association.rules,method="two-key plot")


subRules_dp<-dp.association.rules[quality(dp.association.rules)$confidence>0.1]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(dp.association.rules, n = 10, by = "count")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_dp, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_dp, n=25, by="count")
plot(subRules_dp, method="paracoord")

#Plot SubRules
plot(narcotics_clean.association.rules,method="two-key plot")


subRules_narcotics<-narcotics_clean.association.rules[quality(narcotics_clean.association.rules)$confidence>0.6]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(narcotics_clean.association.rules, n = 10, by = "confidence")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_narcotics, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_narcotics, n=25, by="confidence")
plot(subRules_narcotics, method="paracoord")

#Plot SubRules
plot(robbery.association.rules,method="two-key plot")


subRules_robbery<-robbery.association.rules[quality(robbery.association.rules)$confidence>0.15]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(robbery.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_robbery, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_robbery, n=25, by="confidence")
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(theft.association.rules,method="two-key plot")


subRules_theft<-theft.association.rules[quality(theft.association.rules)$confidence>0.45]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(theft.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_theft, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(vc.association.rules,method="two-key plot")


subRules_vc<-vc.association.rules[quality(vc.association.rules)$confidence>0.15]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(vc.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_vc, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(wv.association.rules,method="two-key plot")


subRules_wv<-wv.association.rules[quality(wv.association.rules)$confidence>0.1]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(wv.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_wv, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation
plot(top10subRules, method="paracoord")

#Plot SubRules
plot(ocho.association.rules,method="two-key plot")


subRules_8<-ocho.association.rules[quality(ocho.association.rules)$confidence>0.01]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(ocho.association.rules, n = 20, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_8, method = "graph",  engine = "htmlwidget")


## Individual Rule Representation
plot(top10subRules, method="paracoord")

Mapas de Densidad

## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")
library(viridis)
Loading required package: viridisLite
library(RColorBrewer)

assault <- na.omit(assault)
setDT(assault)
criminal_damage <- na.omit(criminal_damage)
setDT(criminal_damage)
deceptive_practice <- na.omit(deceptive_practice)
setDT(deceptive_practice)
narcotics <- na.omit(narcotics)
setDT(narcotics)
robbery <- na.omit(robbery)
setDT(robbery)
theft <- na.omit(theft)
setDT(theft)
violent_crime <- na.omit(violent_crime)
setDT(violent_crime)
weapons_violation <- na.omit(weapons_violation)
setDT(weapons_violation)

## MAKE CONTOUR LINES
## Assault
kde_assault <- bkde2D(assault[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_assault <- contourLines(kde_assault$x1 , kde_assault$x2 , kde_assault$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_assault<- as.factor(sapply(CL_assault, `[[`, "level"))
NLEV_assault <- length(levels(LEVS_assault))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_assault <- lapply(1:length(CL_assault), function(i)
    Polygons(list(Polygon(cbind(CL_assault[[i]]$x, CL_assault[[i]]$y))), ID=i))
spgons_assault = SpatialPolygons(pgons_assault)

## Criminal Damage
kde_cd <- bkde2D(criminal_damage[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_cd <- contourLines(kde_cd$x1 , kde_cd$x2 , kde_cd$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_cd<- as.factor(sapply(CL_cd, `[[`, "level"))
NLEV_cd <- length(levels(LEVS_cd))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_cd <- lapply(1:length(CL_cd), function(i)
    Polygons(list(Polygon(cbind(CL_cd[[i]]$x, CL_cd[[i]]$y))), ID=i))
less than 4 coordinates in polygonless than 4 coordinates in polygon
spgons_cd = SpatialPolygons(pgons_cd)

## Deceptive Practice
kde_dp <- bkde2D(deceptive_practice[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_dp <- contourLines(kde_dp$x1 , kde_dp$x2 , kde_dp$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_dp<- as.factor(sapply(CL_dp, `[[`, "level"))
NLEV_dp <- length(levels(LEVS_dp))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_dp <- lapply(1:length(CL_dp), function(i)
    Polygons(list(Polygon(cbind(CL_dp[[i]]$x, CL_dp[[i]]$y))), ID=i))
spgons_dp = SpatialPolygons(pgons_dp)

## Narcotics
kde_narcotics <- bkde2D(narcotics[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_narcotics <- contourLines(kde_narcotics$x1 , kde_narcotics$x2 , kde_narcotics$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_narcotics <- as.factor(sapply(CL_narcotics, `[[`, "level"))
NLEV_narcotics <- length(levels(LEVS_narcotics))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_narcotics <- lapply(1:length(CL_narcotics), function(i)
    Polygons(list(Polygon(cbind(CL_narcotics[[i]]$x, CL_narcotics[[i]]$y))), ID=i))
spgons_narcotics = SpatialPolygons(pgons_narcotics)

## Robbery
kde_robbery <- bkde2D(robbery[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_robbery <- contourLines(kde_robbery$x1 , kde_robbery$x2 , kde_robbery$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_robbery <- as.factor(sapply(CL_robbery, `[[`, "level"))
NLEV_robbery <- length(levels(LEVS_robbery))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_robbery <- lapply(1:length(CL_robbery), function(i)
    Polygons(list(Polygon(cbind(CL_robbery[[i]]$x, CL_robbery[[i]]$y))), ID=i))
spgons_robbery = SpatialPolygons(pgons_robbery)

## Thefts
kde_theft <- bkde2D(theft[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_theft <- contourLines(kde_theft$x1 , kde_theft$x2 , kde_theft$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_theft <- as.factor(sapply(CL_theft, `[[`, "level"))
NLEV_theft <- length(levels(LEVS_theft))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_theft <- lapply(1:length(CL_theft), function(i)
    Polygons(list(Polygon(cbind(CL_theft[[i]]$x, CL_theft[[i]]$y))), ID=i))
spgons_theft = SpatialPolygons(pgons_theft)

## Violent Crimws
kde_vc <- bkde2D(violent_crime[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_vc <- contourLines(kde_vc$x1 , kde_vc$x2 , kde_vc$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_vc <- as.factor(sapply(CL_vc, `[[`, "level"))
NLEV_vc <- length(levels(LEVS_vc))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_vc <- lapply(1:length(CL_vc), function(i)
    Polygons(list(Polygon(cbind(CL_vc[[i]]$x, CL_vc[[i]]$y))), ID=i))
spgons_vc = SpatialPolygons(pgons_vc)

## Weapons Violation
kde_wv <- bkde2D(weapons_violation[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
Binning grid too coarse for current (small) bandwidth: consider increasing 'gridsize'
CL_wv <- contourLines(kde_wv$x1 , kde_wv$x2 , kde_wv$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_wv <- as.factor(sapply(CL_wv, `[[`, "level"))
NLEV_wv <- length(levels(LEVS_wv))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_wv <- lapply(1:length(CL_wv), function(i)
    Polygons(list(Polygon(cbind(CL_wv[[i]]$x, CL_wv[[i]]$y))), ID=i))
less than 4 coordinates in polygon
spgons_wv = SpatialPolygons(pgons_wv)

leaflet() %>% addTiles() %>%
    addPolygons(data = spgons_narcotics, color = brewer.pal(NLEV_narcotics, name = "YlOrRd")[LEVS_narcotics], group = "Narcotics") %>%
    addPolygons(data = spgons_assault, color = brewer.pal(NLEV_assault, name = "Reds")[LEVS_assault], group = "Assault") %>%
    addPolygons(data = spgons_cd, color = brewer.pal(NLEV_cd, name="YlGnBu")[LEVS_cd], group = "Criminal Damage") %>%
    addPolygons(data = spgons_dp, color = brewer.pal(NLEV_dp, name = "YlGn")[LEVS_dp], group = "Deceptive Practice") %>%
    addPolygons(data = spgons_robbery, color = brewer.pal(NLEV_robbery, name = "Purples")[LEVS_robbery], group = "Robbery") %>%
    addPolygons(data = spgons_theft, color = brewer.pal(NLEV_theft, name = "Oranges")[LEVS_theft], group = "Thefts") %>%
    addPolygons(data = spgons_vc, color = brewer.pal(NLEV_vc, name = "Greys")[LEVS_vc], group = "Violent Crimes") %>%
    addPolygons(data = spgons_wv, color = brewer.pal(NLEV_wv, name = "Blues")[LEVS_wv], group = "Weapons Violation") %>%
    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
                      labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T), group = "Districts") %>%
    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))
n too large, allowed maximum for palette YlOrRd is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette Reds is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette YlGn is 9
Returning the palette you asked for with that many colors
n too large, allowed maximum for palette Greys is 9
Returning the palette you asked for with that many colors

    
#addCircles(lng = narcotics$longitude, lat = narcotics$latitude,radius = .1, opacity = .4, col = "blue", group = "Points") %>%
#leaflet() %>% addTiles() %>%
#    addCircles(lng = weapons_violation$longitude, lat = weapons_violation$latitude,radius = .05, opacity = 0.1, col = brewer.pal(10,name = "Reds"), group = "Narcotics") %>%
#    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
#                      labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T, textsize = "15px"), group = #"Districts") %>%
#    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))

Clustering

chicago_crime_clustering <- subset(chicago_crime_subset, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
unique(chicago_crime_clustering$primary_type)
[1] ASSAULT            NARCOTICS          DECEPTIVE PRACTICE ROBBERY            WEAPONS VIOLATION  THEFT              VIOLENT CRIME     
[8] CRIMINAL DAMAGE   
Levels: ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))

types <- unique(chicago_crime_clustering$primary_type)
chicago_crime_clustering$primary_type <- match(chicago_crime_clustering$primary_type, unique(chicago_crime_clustering$primary_type))
chicago_crime_clustering$arrest <- match(chicago_crime_clustering$arrest, unique(chicago_crime_clustering$arrest))
#chicago_crime_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(chicago_crime_clustering$location_description))
chicago_crime_clustering$district <- as.numeric(chicago_crime_clustering$district)
test <- chicago_crime_clustering
#Normalization of variables
library(RSNNS)

train_set <- subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
train_set$primary_type <- match(train_set$primary_type, unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
train_label <- train_set[,1]
test_label <- test[,1]

#Optimum number of clusters. Elbow method
# Alternative using fviz function for Elbow method
library(factoextra)
library(NbClust)
set.seed(123)
train_small <- train_set[1:1000,]
test_small <- test[1:100,]

## Dendograms
library(tidyverse)      #data manipulation and visualization
library(class)          # to call class package for kNN
library(caret)
library(cluster)
# Divisive Hierarchical Clustering - diana
# compute divisive hierarchical clustering
div <- diana(train_small)
plot(div, main = "Divisive")


distance <- dist(train_small,method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(agg,
     main = "Agglomerative, complete linkages")


library(fpc)
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
                  "wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
  row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
  stats.names[i] <- paste("Test", i-1)
  
  for(j in seq_along(clust.assess)){
    output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
    
  }
  
  for(d in 1:k) {
    cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
    dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
    cluster.sizes[d, i]
    
  }
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
# I am capping the maximum amout of clusters by 7
# I want to choose a reasonable number, based on which I will be able to see basic differences between customer groups as a result
stats.df.divisive <- cstats.table(distance, div, 7)
stats.df.divisive

stats.df.aggl <-cstats.table(distance, agg, 7) #complete linkages looks like the most balanced approach
stats.df.aggl

#confusionMatrix(train_small, )
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))
#narcotics <- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#narcotics_tr <- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
narcotics_clustering <- subset(narcotics, select=-c(location_description))
narcotics_clustering_tr <- subset(narcotics_tr, select=-c(location_description))

types <- unique(chicago_crime_clustering$primary_type)
narcotics_clustering$primary_type <- match(chicago_crime_clustering$primary_type, unique(chicago_crime_clustering$primary_type))
narcotics_clustering$arrest <- match(chicago_crime_clustering$arrest, unique(chicago_crime_clustering$arrest))
#narcotics_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(narcotics_clustering$location_description))
narcotics_clustering$district <- as.numeric(narcotics_clustering$district)
test <- narcotics_clustering
#Normalization of variables
library(RSNNS)

train_set <- narcotics_clustering_tr
train_set$primary_type <- match(train_set$primary_type, unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
train_label <- train_set[,1]
test_label <- test[,1]

#Optimum number of clusters. Elbow method
# Alternative using fviz function for Elbow method
library(factoextra)
library(NbClust)
set.seed(123)
train_small <- train_set[1:1000,]
test_small <- test[1:100,]

## Dendograms
library(tidyverse)      #data manipulation and visualization
library(class)          # to call class package for kNN
library(caret)
library(cluster)
# Divisive Hierarchical Clustering - diana
# compute divisive hierarchical clustering
div <- diana(train_small)
plot(div, main = "Divisive")


distance <- dist(train_small,method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(agg,
     main = "Agglomerative, complete linkages")


library(fpc)
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
                  "wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
  row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
  stats.names[i] <- paste("Test", i-1)
  
  for(j in seq_along(clust.assess)){
    output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
    
  }
  
  for(d in 1:k) {
    cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
    dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
    cluster.sizes[d, i]
    
  }
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
# I am capping the maximum amout of clusters by 7
# I want to choose a reasonable number, based on which I will be able to see basic differences between customer groups as a result
stats.df.divisive <- cstats.table(distance, div, 7)
stats.df.divisive

stats.df.aggl <-cstats.table(distance, agg, 7) #complete linkages looks like the most balanced approach
stats.df.aggl

#confusionMatrix(train_small, )
library("ggplot2")
library("reshape2")

Attaching package: ‘reshape2’

The following object is masked from ‘package:tidyr’:

    smiths

The following objects are masked from ‘package:data.table’:

    dcast, melt
library("purrr")
library("dplyr")
# let's start with a dendrogram
library("dendextend")

---------------------
Welcome to dendextend version 1.13.4
Type citation('dendextend') for how to cite the package.

Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/

Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
Or contact: <tal.galili@gmail.com>

    To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
---------------------


Attaching package: ‘dendextend’

The following object is masked from ‘package:data.table’:

    set

The following object is masked from ‘package:stats’:

    cutree
dendro <- as.dendrogram(agg)
dendro.col <- dendro %>%
  set("branches_k_color", k = 8, value =   c("darkslategray", "darkslategray4", "darkslategray3", "gold3", "darkcyan", "cyan3", "gold3")) %>%
  set("branches_lwd", 0.6) %>%
  set("labels_colors", 
      value = c("darkslategray")) %>% 
  set("labels_cex", 0.5)
Length of color vector was shorter than the number of clusters - color vector was recycled
ggd1 <- as.ggdend(dendro.col)
ggplot(ggd1, theme = theme_minimal()) +
  labs(x = "Num. observations", y = "Height", title = "Dendrogram, k = 8")

NA
NA
NA

Arboles de Decision

## c50

library(dplyr)
library(MASS)        # for obtaining data
library(tidyverse)  # for data processing
library(rpart)      # for CART decision tree
library(rpart.plot) # for plotting CART
library(caret)      # for confusion matrix and more
library(rsample)    # for data splitting
library(data.table)
library(C50)


#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)

chicago_crime_trees <- subset(chicago_crime_subset, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
chicago_crime_trees_tr <- subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))

library(dplyr)
chicago_crime_trees %>% mutate_if(is.factor, as.character) -> chicago_crime_trees
chicago_crime_trees_tr %>% mutate_if(is.factor, as.character) -> chicago_crime_trees_tr

chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "ROBBERY"] <- "ROB"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "NARCOTICS"] <- "NAR"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "ASSAULT"] <- "ASS"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "WEAPONS VIOLATION"] <- "WV"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "CRIMINAL DAMAGE"] <- "CD"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "VIOLENT CRIME"] <- "VC"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "DECEPTIVE PRACTICE"] <- "DP"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "THEFT"] <- "TH"

chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "ROBBERY"] <- "ROB"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "NARCOTICS"] <- "NAR"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "ASSAULT"] <- "ASS"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "WEAPONS VIOLATION"] <- "WV"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "CRIMINAL DAMAGE"] <- "CD"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "VIOLENT CRIME"] <- "VC"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "DECEPTIVE PRACTICE"] <- "DP"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "THEFT"] <- "TH"

#train_c50<- chicago_crime_trees_tr
#test_c50<- chicago_crime_trees

crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$primary_type <- as.factor(train_c50$primary_type)
test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(primary_type  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.2))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = primary_type ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.2))


C5.0 [Release 2.07 GPL Edition]     Fri May  8 13:49:43 2020
-------------------------------

Class specified by attribute `outcome'

Read 488358 cases (3 attributes) from undefined.data

Decision tree:

arrest = False: THEFT (396262/242319)
arrest = True:
:...district in {1,2,12,14,16,17,18,19,20,24,31}: THEFT (29329/19863)
    district in {3,4,5,6,7,8,9,10,11,15,22,25}: NARCOTICS (62767/36815)


Evaluation on training data (488358 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

         3 298997(61.2%)   <<


        (a)    (b)    (c)    (d)    (e)    (f)    (g)    (h)    <-classified as
      -----  -----  -----  -----  -----  -----  -----  -----
                            4197         30630                  (a): class ASSAULT
                            1976         60313                  (b): class CRIMINAL DAMAGE
                             970         41627                  (c): class DECEPTIVE PRACTICE
                           25952          4001                  (d): class NARCOTICS
                            7181         65181                  (e): class ROBBERY
                            7604        163409                  (f): class THEFT
                            7881         56651                  (g): class VIOLENT CRIME
                            7006          3779                  (h): class WEAPONS VIOLATION


    Attribute usage:

    100.00% arrest
     18.86% district


Time: 0.2 secs
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$primary_type)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$primary_type),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 61.0087722890678 % 47604 correct classified cases from 122089"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$primary_type)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$primary_type),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 61.2249620155705 % 189361 correct classified cases from 488358"
#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.1))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.1))


C5.0 [Release 2.07 GPL Edition]     Fri May  8 13:47:32 2020
-------------------------------

Class specified by attribute `outcome'

Read 488358 cases (4 attributes) from undefined.data

Decision tree:

primary_type in {NARCOTICS,WEAPONS VIOLATION}:
:...primary_type = WEAPONS VIOLATION:
:   :...arrest = False:
:   :   :...location_description <= 18:
:   :   :   :...location_description <= 14: 7 (9/6)
:   :   :   :   location_description > 14: 25 (400/355)
:   :   :   location_description > 18:
:   :   :   :...location_description <= 19: 11 (210/167)
:   :   :       location_description > 19:
:   :   :       :...location_description > 128: 11 (1516/1354)
:   :   :           location_description <= 128:
:   :   :           :...location_description > 110: 7 (534/464)
:   :   :               location_description <= 110:
:   :   :               :...location_description <= 64: 5 (50/39)
:   :   :                   location_description > 64: 4 (150/131)
:   :   arrest = True:
:   :   :...location_description > 113:
:   :       :...location_description <= 135:
:   :       :   :...location_description <= 121: 7 (1064/902)
:   :       :   :   location_description > 121: 10 (1567/1339)
:   :       :   location_description > 135:
:   :       :   :...location_description <= 141: 7 (3228/2755)
:   :       :       location_description > 141:
:   :       :       :...location_description <= 144: 10 (72/51)
:   :       :           location_description > 144: 11 (404/314)
:   :       location_description <= 113:
:   :       :...location_description <= 34:
:   :           :...location_description <= 15:
:   :           :   :...location_description <= 4: 7 (25/16)
:   :           :   :   location_description > 4: 16 (13)
:   :           :   location_description > 15:
:   :           :   :...location_description <= 17: 7 (466/413)
:   :           :       location_description > 17:
:   :           :       :...location_description <= 19: 11 (343/293)
:   :           :           location_description > 19:
:   :           :           :...location_description <= 30: 15 (23/18)
:   :           :               location_description > 30: 16 (7/4)
:   :           location_description > 34:
:   :           :...location_description <= 44: 5 (76/53)
:   :               location_description > 44:
:   :               :...location_description > 105:
:   :                   :...location_description <= 107: 4 (90/79)
:   :                   :   location_description > 107: 6 (206/174)
:   :                   location_description <= 105:
:   :                   :...location_description <= 73:
:   :                       :...location_description <= 55: 3 (41/35)
:   :                       :   location_description > 55: 1 (42/31)
:   :                       location_description > 73:
:   :                       :...location_description <= 77: 6 (134/104)
:   :                           location_description > 77:
:   :                           :...location_description <= 101: 5 (45/39)
:   :                               location_description > 101:
:   :                               :...location_description <= 103: 11 (63/49)
:   :                                   location_description > 103: 1 (7/4)
:   primary_type = NARCOTICS:
:   :...location_description > 110:
:       :...location_description > 133: 11 (10740/8406)
:       :   location_description <= 133:
:       :   :...location_description > 131: 11 (8624/4943)
:       :       location_description <= 131:
:       :       :...location_description <= 113: 11 (827/288)
:       :           location_description > 113:
:       :           :...location_description <= 121:
:       :               :...location_description <= 116: 7 (1519/1223)
:       :               :   location_description > 116: 11 (996/597)
:       :               location_description > 121:
:       :               :...location_description <= 127:
:       :                   :...location_description <= 124: 11 (110/88)
:       :                   :   location_description > 124: 18 (52/45)
:       :                   location_description > 127:
:       :                   :...location_description > 129: 10 (82/69)
:       :                       location_description <= 129:
:       :                       :...location_description <= 128: 8 (17/14)
:       :                           location_description > 128: 25 (369/327)
:       location_description <= 110:
:       :...location_description <= 19:
:           :...location_description > 16: 11 (3029/2243)
:           :   location_description <= 16:
:           :   :...location_description <= 2: 11 (230/129)
:           :       location_description > 2: 16 (128/33)
:           location_description > 19:
:           :...location_description > 107: 11 (821/674)
:               location_description <= 107:
:               :...location_description > 77:
:                   :...location_description <= 79: 16 (217/44)
:                   :   location_description > 79:
:                   :   :...location_description > 101:
:                   :       :...location_description <= 103: 11 (347/240)
:                   :       :   location_description > 103:
:                   :       :   :...location_description <= 105: 1 (70/4)
:                   :       :       location_description > 105: 10 (298/226)
:                   :       location_description <= 101:
:                   :       :...location_description <= 87: 7 (201/165)
:                   :           location_description > 87:
:                   :           :...location_description <= 90: 18 (31/16)
:                   :               location_description > 90:
:                   :               :...location_description > 98: 15 (39/31)
:                   :                   location_description <= 98:
:                   :                   :...location_description <= 92: 10 (41/28)
:                   :                       location_description > 92: 12 (59/49)
:                   location_description <= 77:
:                   :...location_description > 73: 11 (428/308)
:                       location_description <= 73:
:                       :...location_description > 62: 25 (119/106)
:                           location_description <= 62:
:                           :...location_description > 54:
:                               :...location_description > 59: 1 (54/31)
:                               :   location_description <= 59:
:                               :   :...location_description <= 57: 1 (66/49)
:                               :       location_description > 57: 6 (83/50)
:                               location_description <= 54:
:                               :...location_description > 50: 11 (128/95)
:                                   location_description <= 50:
:                                   :...location_description > 44:
:                                       :...location_description <= 48: 18 (20/9)
:                                       :   location_description > 48: 1 (13/10)
:                                       location_description <= 44:
:                                       :...location_description <= 27: [S1]
:                                           location_description > 27: [S2]
primary_type in {ASSAULT,CRIMINAL DAMAGE,ROBBERY,VIOLENT CRIME}:
:...location_description <= 19:
:   :...location_description <= 16:
:   :   :...location_description <= 2: 7 (379/308)
:   :   :   location_description > 2: 16 (777/111)
:   :   location_description > 16:
:   :   :...location_description > 18: 3 (28087/24753)
:   :       location_description <= 18:
:   :       :...location_description > 17: 18 (39/34)
:   :           location_description <= 17:
:   :           :...primary_type in {CRIMINAL DAMAGE,
:   :               :                VIOLENT CRIME}: 11 (2839/2595)
:   :               primary_type = ASSAULT:
:   :               :...arrest = False: 25 (785/709)
:   :               :   arrest = True: 11 (179/162)
:   :               primary_type = ROBBERY:
:   :               :...arrest = False: 11 (1916/1722)
:   :                   arrest = True: 19 (159/142)
:   location_description > 19:
:   :...location_description <= 109:
:       :...location_description <= 40:
:       :   :...location_description > 34:
:       :   :   :...location_description > 38: 5 (868/648)
:       :   :   :   location_description <= 38:
:       :   :   :   :...location_description <= 36: 5 (742/599)
:       :   :   :       location_description > 36:
:       :   :   :       :...primary_type = CRIMINAL DAMAGE: 19 (7/5)
:       :   :   :           primary_type in {ROBBERY,
:       :   :   :           :                VIOLENT CRIME}: 1 (193/124)
:       :   :   :           primary_type = ASSAULT:
:       :   :   :           :...arrest = False: 2 (82/53)
:       :   :   :               arrest = True: 1 (22/15)
:       :   :   location_description <= 34:
:       :   :   :...location_description <= 25:
:       :   :       :...primary_type in {ASSAULT,CRIMINAL DAMAGE,ROBBERY}:
:       :   :       :   :...location_description <= 20: 9 (49/42)
:       :   :       :   :   location_description > 20: 1 (602/506)
:       :   :       :   primary_type = VIOLENT CRIME:
:       :   :       :   :...location_description <= 21:
:       :   :       :       :...location_description <= 20: 6 (20/16)
:       :   :       :       :   location_description > 20: 1 (46/35)
:       :   :       :       location_description > 21:
:       :   :       :       :...location_description <= 24: 8 (198/173)
:       :   :       :           location_description > 24: 1 (24/17)
:       :   :       location_description > 25:
:       :   :       :...location_description <= 26:
:       :   :           :...primary_type in {ASSAULT,CRIMINAL DAMAGE,
:       :   :           :   :                ROBBERY}: 19 (470/369)
:       :   :           :   primary_type = VIOLENT CRIME: 18 (1234/866)
:       :   :           location_description > 26:
:       :   :           :...location_description > 32: 17 (176/153)
:       :   :               location_description <= 32:
:       :   :               :...location_description <= 29: 6 (314/281)
:       :   :                   location_description > 29:
:       :   :                   :...location_description <= 30: 1 (30/12)
:       :   :                       location_description > 30: [S3]
:       :   location_description > 40:
:       :   :...location_description <= 79:
:       :       :...location_description > 73:
:       :       :   :...location_description > 77: 1 (538/424)
:       :       :   :   location_description <= 77:
:       :       :   :   :...primary_type = CRIMINAL DAMAGE: 6 (557/488)
:       :       :   :       primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
:       :       :   :       :...primary_type = ASSAULT: 6 (798/697)
:       :       :   :           primary_type = ROBBERY: 11 (1721/1425)
:       :       :   :           primary_type = VIOLENT CRIME:
:       :       :   :           :...location_description <= 75: 10 (7/4)
:       :       :   :               location_description > 75: 11 (850/747)
:       :       :   location_description <= 73:
:       :       :   :...primary_type = CRIMINAL DAMAGE:
:       :       :       :...location_description <= 54:
:       :       :       :   :...location_description <= 44:
:       :       :       :   :   :...arrest = False: 2 (286/255)
:       :       :       :   :   :   arrest = True: 7 (9/6)
:       :       :       :   :   location_description > 44:
:       :       :       :   :   :...location_description <= 50:
:       :       :       :   :       :...location_description <= 46: 4 (45/39)
:       :       :       :   :       :   location_description > 46:
:       :       :       :   :       :   :...location_description <= 47: 1 (21/15)
:       :       :       :   :       :       location_description > 47: 12 (627/559)
:       :       :       :   :       location_description > 50:
:       :       :       :   :       :...location_description <= 52:
:       :       :       :   :           :...location_description <= 51: 1 (225/203)
:       :       :       :   :           :   location_description > 51: 6 (7/4)
:       :       :       :   :           location_description > 52:
:       :       :       :   :           :...arrest = True: 9 (32/27)
:       :       :       :   :               arrest = False: [S4]
:       :       :       :   location_description > 54:
:       :       :       :   :...location_description <= 60:
:       :       :       :       :...arrest = False: 24 (810/591)
:       :       :       :       :   arrest = True: 1 (74/53)
:       :       :       :       location_description > 60:
:       :       :       :       :...arrest = True: 3 (44/38)
:       :       :       :           arrest = False:
:       :       :       :           :...location_description > 64: 8 (540/468)
:       :       :       :               location_description <= 64:
:       :       :       :               :...location_description > 62: 18 (110/89)
:       :       :       :                   location_description <= 62: [S5]
:       :       :       primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
:       :       :       :...location_description <= 54:
:       :       :           :...location_description > 50:
:       :       :           :   :...location_description > 52:
:       :       :           :   :   :...location_description <= 53: 6 (1016/913)
:       :       :           :   :   :   location_description > 53: 11 (653/584)
:       :       :           :   :   location_description <= 52: [S6]
:       :       :           :   location_description <= 50:
:       :       :           :   :...location_description > 46:
:       :       :           :       :...location_description <= 48: 1 (140/98)
:       :       :           :       :   location_description > 48:
:       :       :           :       :   :...location_description <= 49: 1 (1110/952)
:       :       :           :       :       location_description > 49: 12 (336/275)
:       :       :           :       location_description <= 46:
:       :       :           :       :...arrest = True:
:       :       :           :           :...location_description <= 44: 18 (87/77)
:       :       :           :           :   location_description > 44: 5 (39/31)
:       :       :           :           arrest = False:
:       :       :           :           :...location_description > 43: 8 (91/78)
:       :       :           :               location_description <= 43: [S7]
:       :       :           location_description > 54:
:       :       :           :...location_description <= 60:
:       :       :               :...primary_type in {ASSAULT,
:       :       :               :   :                VIOLENT CRIME}: 1 (1631/1245)
:       :       :               :   primary_type = ROBBERY:
:       :       :               :   :...arrest = False: 1 (700/554)
:       :       :               :       arrest = True:
:       :       :               :       :...location_description <= 57: 1 (122/98)
:       :       :               :           location_description > 57: [S8]
:       :       :               location_description > 60:
:       :       :               :...location_description <= 64:
:       :       :                   :...location_description <= 62: 3 (230/207)
:       :       :                   :   location_description > 62: 1 (956/765)
:       :       :                   location_description > 64:
:       :       :                   :...location_description > 67:
:       :       :                       :...location_description <= 68: [S9]
:       :       :                       :   location_description > 68: [S10]
:       :       :                       location_description <= 67:
:       :       :                       :...location_description > 65: [S11]
:       :       :                           location_description <= 65:
:       :       :                           :...arrest = False: 16 (196/171)
:       :       :                               arrest = True: [S12]
:       :       location_description > 79:
:       :       :...location_description <= 89:
:       :           :...location_description > 86:
:       :           :   :...arrest = False: 18 (613/430)
:       :           :   :   arrest = True: 1 (348/212)
:       :           :   location_description <= 86:
:       :           :   :...location_description > 84:
:       :           :       :...arrest = False: 12 (782/657)
:       :           :       :   arrest = True: 18 (384/294)
:       :           :       location_description <= 84:
:       :           :       :...location_description > 81: 3 (55/49)
:       :           :           location_description <= 81:
:       :           :           :...primary_type = ASSAULT: 18 (425/389)
:       :           :               primary_type = CRIMINAL DAMAGE: 6 (225/205)
:       :           :               primary_type in {ROBBERY,
:       :           :                                VIOLENT CRIME}: 19 (1236/1138)
:       :           location_description > 89:
:       :           :...location_description > 100:
:       :               :...arrest = True:
:       :               :   :...location_description <= 106: 1 (1848/1539)
:       :               :   :   location_description > 106: 6 (1264/1133)
:       :               :   arrest = False:
:       :               :   :...primary_type = CRIMINAL DAMAGE:
:       :               :       :...location_description > 107: 1 (3235/2863)
:       :               :       :   location_description <= 107:
:       :               :       :   :...location_description <= 105: 8 (1909/1754)
:       :               :       :       location_description > 105: 19 (257/231)
:       :               :       primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
:       :               :       :...location_description <= 104:
:       :               :           :...location_description > 103: 1 (54/35)
:       :               :           :   location_description <= 103: [S13]
:       :               :           location_description > 104:
:       :               :           :...location_description > 108: 8 (2901/2669)
:       :               :               location_description <= 108:
:       :               :               :...location_description <= 105: 1 (63/52)
:       :               :                   location_description > 105: [S14]
:       :               location_description <= 100:
:       :               :...location_description > 98:
:       :                   :...primary_type = CRIMINAL DAMAGE: 2 (40/34)
:       :                   :   primary_type = ROBBERY: 20 (63/56)
:       :                   :   primary_type = VIOLENT CRIME: 15 (719/607)
:       :                   :   primary_type = ASSAULT:
:       :                   :   :...arrest = False: 2 (185/161)
:       :                   :       arrest = True: 15 (16/12)
:       :                   location_description <= 98:
:       :                   :...location_description > 96: 18 (121/99)
:       :                       location_description <= 96:
:       :                       :...location_description <= 92:
:       :                           :...location_description <= 90: 5 (54/40)
:       :                           :   location_description > 90: 12 (94/73)
:       :                           location_description > 92:
:       :                           :...arrest = True:
:       :                               :...location_description <= 94: 1 (145/71)
:       :                               :   location_description > 94: 14 (24/20)
:       :                               arrest = False:
:       :                               :...location_description <= 93: 19 (37/27)
:       :                                   location_description > 93: [S15]
:       location_description > 109:
:       :...location_description <= 121:
:           :...location_description > 119: 8 (13730/11829)
:           :   location_description <= 119:
:           :   :...location_description > 116:
:           :       :...primary_type in {ASSAULT,VIOLENT CRIME}:
:           :       :   :...arrest = False: 11 (2154/1959)
:           :       :   :   arrest = True: 7 (449/403)
:           :       :   primary_type in {CRIMINAL DAMAGE,ROBBERY}:
:           :       :   :...arrest = False:
:           :       :       :...primary_type = CRIMINAL DAMAGE: 19 (667/614)
:           :       :       :   primary_type = ROBBERY: 2 (616/568)
:           :       :       arrest = True:
:           :       :       :...primary_type = CRIMINAL DAMAGE: 19 (59/50)
:           :       :           primary_type = ROBBERY: 24 (413/370)
:           :       location_description <= 116:
:           :       :...location_description <= 111:
:           :           :...location_description <= 110: 9 (45/37)
:           :           :   location_description > 110:
:           :           :   :...primary_type = ASSAULT: 19 (84/75)
:           :           :       primary_type = VIOLENT CRIME: 11 (158/140)
:           :           :       primary_type = ROBBERY: 3 (181/140)
:           :           :       primary_type = CRIMINAL DAMAGE:
:           :           :       :...arrest = False: 11 (67/57)
:           :           :           arrest = True: 10 (61/51)
:           :           location_description > 111:
:           :           :...primary_type = ROBBERY:
:           :               :...arrest = False: 8 (11026/9771)
:           :               :   arrest = True: 5 (963/871)
:           :               primary_type in {ASSAULT,CRIMINAL DAMAGE,
:           :               :                VIOLENT CRIME}:
:           :               :...primary_type in {ASSAULT,
:           :                   :                VIOLENT CRIME}: 7 (9802/8729)
:           :                   primary_type = CRIMINAL DAMAGE:
:           :                   :...arrest = False: 8 (10185/9121)
:           :                       arrest = True: 4 (418/370)
:           location_description > 121:
:           :...location_description <= 122:
:               :...primary_type in {ASSAULT,CRIMINAL DAMAGE,
:               :   :                ROBBERY}: 1 (4301/3863)
:               :   primary_type = VIOLENT CRIME: 18 (1480/1277)
:               location_description > 122:
:               :...location_description <= 130:
:                   :...location_description > 128: 8 (7069/6410)
:                   :   location_description <= 128:
:                   :   :...location_description <= 126: 25 (20/16)
:                   :       location_description > 126:
:                   :       :...primary_type in {ASSAULT,
:                   :           :                VIOLENT CRIME}: 11 (750/668)
:                   :           primary_type = ROBBERY:
:                   :           :...arrest = False: 12 (78/68)
:                   :           :   arrest = True: 24 (90/73)
:                   :           primary_type = CRIMINAL DAMAGE:
:                   :           :...arrest = True: 9 (13/10)
:                   :               arrest = False:
:                   :               :...location_description <= 127: 8 (87/77)
:                   :                   location_description > 127: 15 (44/38)
:                   location_description > 130:
:                   :...primary_type = CRIMINAL DAMAGE:
:                       :...location_description > 134: 8 (24708/22873)
:                       :   location_description <= 134:
:                       :   :...location_description <= 133:
:                       :       :...arrest = False: 19 (368/337)
:                       :       :   arrest = True: 10 (168/150)
:                       :       location_description > 133:
:                       :       :...arrest = False: 18 (648/598)
:                       :           arrest = True: 8 (60/50)
:                       primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}:
:                       :...location_description <= 136:
:                           :...location_description <= 133:
:                           :   :...arrest = False: 11 (21858/19830)
:                           :   :   arrest = True: [S16]
:                           :   location_description > 133:
:                           :   :...location_description > 134: 19 (202/132)
:                           :       location_description <= 134: [S17]
:                           location_description > 136:
:                           :...location_description > 151:
:                               :...location_description > 152: 7 (45/37)
:                               :   location_description <= 152: [S18]
:                               location_description <= 151:
:                               :...primary_type = ASSAULT:
:                                   :...location_description > 146:
:                                   :   :...arrest = False: 10 (138/125)
:                                   :   :   arrest = True: 15 (27/22)
:                                   :   location_description <= 146:
:                                   :   :...location_description <= 139: [S19]
:                                   :       location_description > 139: [S20]
:                                   primary_type in {ROBBERY,VIOLENT CRIME}:
:                                   :...location_description <= 140:
:                                       :...location_description <= 137: 11 (21072/19062)
:                                       :   location_description > 137: [S21]
:                                       location_description > 140:
:                                       :...location_description <= 144: [S22]
:                                           location_description > 144: [S23]
primary_type in {DECEPTIVE PRACTICE,THEFT}:
:...location_description > 136:
    :...location_description > 150:
    :   :...arrest = False: 9 (251/215)
    :   :   arrest = True: 1 (36/28)
    :   location_description <= 150:
    :   :...primary_type = DECEPTIVE PRACTICE:
    :       :...arrest = False:
    :       :   :...location_description <= 138: 1 (1926/1643)
    :       :   :   location_description > 138: 18 (668/565)
    :       :   arrest = True:
    :       :   :...location_description <= 138: 7 (338/306)
    :       :       location_description > 138:
    :       :       :...location_description > 141: 3 (30/24)
    :       :           location_description <= 141:
    :       :           :...location_description <= 139: 18 (6/3)
    :       :               location_description > 139: 19 (44/35)
    :       primary_type = THEFT:
    :       :...arrest = True:
    :           :...location_description <= 138: 8 (1922/1757)
    :           :   location_description > 138:
    :           :   :...location_description > 144: 22 (118/101)
    :           :       location_description <= 144:
    :           :       :...location_description <= 139: 4 (66/54)
    :           :           location_description > 139:
    :           :           :...location_description <= 141: 19 (18/11)
    :           :               location_description > 141: 11 (8/5)
    :           arrest = False:
    :           :...location_description <= 138: 12 (53034/48284)
    :               location_description > 138:
    :               :...location_description <= 141: 18 (552/404)
    :                   location_description > 141:
    :                   :...location_description <= 143: 7 (365/302)
    :                       location_description > 143:
    :                       :...location_description <= 146: 1 (174/142)
    :                           location_description > 146:
    :                           :...location_description <= 147: 14 (4302/3949)
    :                               location_description > 147: 1 (291/254)
    location_description <= 136:
    :...location_description > 121:
        :...location_description <= 124:
        :   :...arrest = False: 1 (8654/5129)
        :   :   arrest = True: 18 (304/235)
        :   location_description > 124:
        :   :...location_description <= 131:
        :       :...location_description > 128: 8 (1795/1585)
        :       :   location_description <= 128:
        :       :   :...arrest = True: 24 (22/16)
        :       :       arrest = False:
        :       :       :...location_description <= 127: 1 (319/268)
        :       :           location_description > 127: 12 (88/74)
        :       location_description > 131:
        :       :...location_description <= 133:
        :           :...arrest = False: 1 (5383/4398)
        :           :   arrest = True:
        :           :   :...primary_type = DECEPTIVE PRACTICE: 10 (168/136)
        :           :       primary_type = THEFT: 18 (389/334)
        :           location_description > 133:
        :           :...location_description > 134: 19 (333/268)
        :               location_description <= 134:
        :               :...primary_type = DECEPTIVE PRACTICE: 18 (1467/1116)
        :                   primary_type = THEFT: 1 (9510/7853)
        location_description <= 121:
        :...location_description > 109:
            :...location_description > 117:
            :   :...location_description <= 120: 19 (5973/5206)
            :   :   location_description > 120:
            :   :   :...primary_type = THEFT: 8 (4387/4014)
            :   :       primary_type = DECEPTIVE PRACTICE:
            :   :       :...arrest = False: 16 (42/37)
            :   :           arrest = True: 19 (10/6)
            :   location_description <= 117:
            :   :...location_description > 111:
            :       :...primary_type = DECEPTIVE PRACTICE: 8 (13541/12344)
            :       :   primary_type = THEFT: 5 (10708/9898)
            :       location_description <= 111:
            :       :...location_description <= 110: 10 (65/54)
            :           location_description > 110:
            :           :...primary_type = THEFT: 11 (86/70)
            :               primary_type = DECEPTIVE PRACTICE:
            :               :...arrest = False: 9 (59/45)
            :                   arrest = True: 5 (20/15)
            location_description <= 109:
            :...location_description <= 19:
                :...location_description > 16:
                :   :...location_description <= 17: 11 (1754/1609)
                :   :   location_description > 17: 19 (11235/10092)
                :   location_description <= 16:
                :   :...location_description > 2: 16 (1665/302)
                :       location_description <= 2:
                :       :...location_description <= 1: 19 (1757/1494)
                :           location_description > 1: 7 (79/61)
                location_description > 19:
                :...location_description <= 64:
                    :...location_description > 62:
                    :   :...primary_type = DECEPTIVE PRACTICE: 18 (1288/771)
                    :   :   primary_type = THEFT: 1 (8184/5962)
                    :   location_description <= 62:
                    :   :...location_description <= 31:
                    :       :...location_description > 25: 18 (2786/1577)
                    :       :   location_description <= 25:
                    :       :   :...arrest = True:
                    :       :       :...location_description > 23: 2 (70/55)
                    :       :       :   location_description <= 23:
                    :       :       :   :...location_description <= 20: 17 (41/26)
                    :       :       :       location_description > 20: 18 (14/7)
                    :       :       arrest = False:
                    :       :       :...location_description <= 21:
                    :       :           :...location_description <= 20: 6 (77/66)
                    :       :           :   location_description > 20: 1 (1059/830)
                    :       :           location_description > 21:
                    :       :           :...location_description <= 23: 19 (1204/985)
                    :       :               location_description > 23: [S24]
                    :       location_description > 31:
                    :       :...location_description > 55:
                    :           :...location_description > 60: 6 (762/664)
                    :           :   location_description <= 60:
                    :           :   :...arrest = False: 1 (3966/2360)
                    :           :       arrest = True: [S25]
                    :           location_description <= 55:
                    :           :...location_description <= 50:
                    :               :...location_description > 46:
                    :               :   :...location_description <= 48: 1 (213/137)
                    :               :   :   location_description > 48: [S26]
                    :               :   location_description <= 46:
                    :               :   :...location_description > 34: [S27]
                    :               :       location_description <= 34: [S28]
                    :               location_description > 50:
                    :               :...location_description > 54: [S29]
                    :                   location_description <= 54:
                    :                   :...arrest = True: 6 (663/591)
                    :                       arrest = False: [S30]
                    location_description > 64:
                    :...location_description <= 77:
                        :...location_description > 74: 6 (3417/3032)
                        :   location_description <= 74:
                        :   :...location_description <= 65: 8 (979/817)
                        :       location_description > 65:
                        :       :...location_description <= 67: 1 (1995/1769)
                        :           location_description > 67:
                        :           :...location_description <= 69: 9 (87/72)
                        :               location_description > 69:
                        :               :...location_description <= 70: 1 (27/17)
                        :                   location_description > 70: 16 (54/43)
                        location_description > 77:
                        :...location_description <= 87:
                            :...location_description > 82: 18 (884/655)
                            :   location_description <= 82:
                            :   :...location_description <= 79: 1 (261/170)
                            :       location_description > 79:
                            :       :...primary_type = THEFT: 19 (5376/4774)
                            :           primary_type = DECEPTIVE PRACTICE:
                            :           :...arrest = False: 18 (438/392)
                            :               arrest = True: 31 (24/17)
                            location_description > 87:
                            :...location_description <= 90: 18 (1670/972)
                                location_description > 90:
                                :...location_description > 107:
                                    :...arrest = False: 1 (8715/7563)
                                    :   arrest = True: 8 (331/298)
                                    location_description <= 107:
                                    :...location_description > 102: 1 (1978/1288)
                                        location_description <= 102:
                                        :...location_description <= 93: [S31]
                                            location_description > 93: [S32]

SubTree [S1]

location_description <= 25: 1 (12/8)
location_description > 25: 9 (12/9)

SubTree [S2]

location_description > 36: 11 (124/87)
location_description <= 36:
:...location_description <= 34: 11 (15/11)
    location_description > 34: 5 (32/23)

SubTree [S3]

primary_type in {ASSAULT,ROBBERY,VIOLENT CRIME}: 18 (45/32)
primary_type = CRIMINAL DAMAGE: 19 (12/9)

SubTree [S4]

location_description <= 53: 7 (173/152)
location_description > 53: 1 (42/38)

SubTree [S5]

location_description <= 61: 6 (33/26)
location_description > 61: 8 (64/56)

SubTree [S6]

primary_type in {ASSAULT,VIOLENT CRIME}: 11 (784/701)
primary_type = ROBBERY:
:...arrest = False: 25 (492/451)
    arrest = True: 7 (321/284)

SubTree [S7]

location_description <= 42: 15 (4/2)
location_description > 42:
:...primary_type = ASSAULT: 2 (84/71)
    primary_type in {ROBBERY,VIOLENT CRIME}: 7 (303/272)

SubTree [S8]

location_description <= 58: 6 (297/213)
location_description > 58: 1 (87/69)

SubTree [S9]

primary_type in {ASSAULT,VIOLENT CRIME}: 9 (81/67)
primary_type = ROBBERY: 11 (54/39)

SubTree [S10]

location_description <= 70: 1 (43/30)
location_description > 70: 6 (29/24)

SubTree [S11]

primary_type in {ASSAULT,ROBBERY}: 1 (491/429)
primary_type = VIOLENT CRIME: 19 (119/100)

SubTree [S12]

primary_type in {ASSAULT,VIOLENT CRIME}: 8 (15/11)
primary_type = ROBBERY: 3 (15/12)

SubTree [S13]

primary_type in {ASSAULT,VIOLENT CRIME}: 1 (2984/2692)
primary_type = ROBBERY: 8 (1692/1525)

SubTree [S14]

location_description > 106: 11 (20/17)
location_description <= 106:
:...primary_type in {ASSAULT,VIOLENT CRIME}: 4 (1015/934)
    primary_type = ROBBERY: 1 (419/376)

SubTree [S15]

primary_type = ROBBERY:
:...location_description <= 95: 1 (24/20)
:   location_description > 95: 17 (44/35)
primary_type in {ASSAULT,CRIMINAL DAMAGE,VIOLENT CRIME}:
:...primary_type = VIOLENT CRIME: 1 (101/87)
    primary_type = ASSAULT:
    :...location_description <= 95: 12 (70/57)
    :   location_description > 95: 25 (67/60)
    primary_type = CRIMINAL DAMAGE:
    :...location_description <= 95: 16 (34/29)
        location_description > 95: 19 (86/75)

SubTree [S16]

primary_type = ASSAULT: 11 (1106/1018)
primary_type = ROBBERY: 10 (754/695)
primary_type = VIOLENT CRIME: 19 (2046/1857)

SubTree [S17]

primary_type in {ASSAULT,VIOLENT CRIME}: 1 (1582/1388)
primary_type = ROBBERY:
:...arrest = False: 25 (1418/1283)
    arrest = True: 1 (559/494)

SubTree [S18]

primary_type in {ASSAULT,VIOLENT CRIME}: 9 (100/79)
primary_type = ROBBERY: 12 (182/150)

SubTree [S19]

location_description > 138: 4 (91/77)
location_description <= 138:
:...arrest = False: 9 (5918/5476)
    arrest = True: 4 (1141/1042)

SubTree [S20]

location_description > 143: 2 (29/23)
location_description <= 143:
:...location_description <= 141: 18 (15/11)
    location_description > 141: 11 (33/27)

SubTree [S21]

location_description > 139: 18 (127/103)
location_description <= 139:
:...location_description <= 138: 4 (3/2)
    location_description > 138:
    :...primary_type = VIOLENT CRIME: 18 (210/172)
        primary_type = ROBBERY:
        :...arrest = False: 17 (105/93)
            arrest = True: 4 (69/60)

SubTree [S22]

primary_type = ROBBERY:
:...arrest = False: 7 (476/398)
:   arrest = True: 11 (45/38)
primary_type = VIOLENT CRIME:
:...arrest = False: 11 (99/76)
    arrest = True: 7 (15/9)

SubTree [S23]

location_description <= 146: 18 (127/107)
location_description > 146:
:...location_description > 147: 18 (84/76)
    location_description <= 147:
    :...primary_type = ROBBERY: 8 (508/472)
        primary_type = VIOLENT CRIME:
        :...arrest = False: 25 (683/622)
            arrest = True: 9 (95/86)

SubTree [S24]

location_description <= 24: 8 (91/77)
location_description > 24: 1 (1814/1549)

SubTree [S25]

primary_type = DECEPTIVE PRACTICE: 11 (301/183)
primary_type = THEFT: 1 (85/62)

SubTree [S26]

location_description <= 49: 1 (1772/1335)
location_description > 49: 12 (454/372)

SubTree [S27]

location_description <= 41: 5 (295/240)
location_description > 41: 2 (542/497)

SubTree [S28]

location_description <= 32: 1 (9/5)
location_description > 32:
:...location_description <= 33: 6 (134/109)
    location_description > 33: 17 (11/7)

SubTree [S29]

primary_type = DECEPTIVE PRACTICE: 1 (32/10)
primary_type = THEFT: 8 (193/143)

SubTree [S30]

location_description <= 52: 1 (1654/1431)
location_description > 52:
:...location_description > 53: 7 (366/326)
    location_description <= 53:
    :...primary_type = DECEPTIVE PRACTICE: 1 (44/38)
        primary_type = THEFT: 6 (730/661)

SubTree [S31]

location_description <= 92: 10 (12/3)
location_description > 92: 18 (114/71)

SubTree [S32]

location_description > 101: 1 (13339/11253)
location_description <= 101:
:...location_description > 99: 19 (498/430)
    location_description <= 99:
    :...location_description > 96: 18 (157/102)
        location_description <= 96:
        :...location_description <= 95: 1 (206/128)
            location_description > 95: 12 (232/205)


Evaluation on training data (488358 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

       348 417035(85.4%)   <<


      Class   Cases   False   False
                        Pos     Neg
      -----   -----   -----   -----
      1       32320   75133   12783
      2       20764    1717   20547
      3       20127   25258   16702
      4       23909    2788   23609
      5       18737   12496   17341
      6       27195    9451   25880
      7       22712   17537   20083
      8       30292   88386   20639
      9       21249    6165   20672
      10      21641    2931   21139
      11      32073   73020   17631
      12      26996   50823   21783
      14      20450    3969   20093
      15      17498     730   17356
      16      16846     774   14487
      17      14975     314   14912
      18      32110   11928   27449
      19      25431   28462   21636
      20       9202      56    9195
      22      14605     101   14588
      24      14664    1050   14379
      25      24550    3929   24126
      31         12      17       5


    Attribute usage:

    100.00% primary_type
    100.00% location_description
     42.20% arrest


Time: 0.5 secs
#Plotting the tree
plot(tree_result,faclen=10, clip.facs=TRUE,subtree=NULL, tweak=1, digits=2)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 90.6715592723341 % 11389 correct classified cases from 122089"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 85.3953452180572 % 71323 correct classified cases from 488358"
#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(arrest  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.9))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,faclen=10, clip.facs=TRUE,subtree=NULL, tweak=1, digits=2)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$arrest)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$arrest),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$arrest)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$arrest),
      "correct classified cases from", length(pred_train))
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#assault <- subset(assault, select=-c(location_description))
#assault_tr <- subset(assault_tr, select=-c(location_description))

assault_tr <- subset(assault_tr, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18" | district == "9" | district=="10" | district == "22" | district == "15" )
assault <- subset(assault, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18" | district == "9" | district=="10" | district == "22" | district == "15" )

#train_c50<- subset(assault_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(assault, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(assault_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.0001))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 1e-04))


C5.0 [Release 2.07 GPL Edition]     Fri May  8 13:51:43 2020
-------------------------------

Class specified by attribute `outcome'

Read 21338 cases (4 attributes) from undefined.data

Decision tree:

location_description <= 13:
:...location_description <= 12: 8 (529/437)
:   location_description > 12: 3 (2139/1703)
location_description > 13:
:...location_description <= 67:
    :...location_description <= 28:
    :   :...location_description <= 25: 18 (182/150)
    :   :   location_description > 25: 5 (267/200)
    :   location_description > 28:
    :   :...location_description > 55:
    :       :...location_description <= 56: 10 (127/102)
    :       :   location_description > 56: 1 (1376/1111)
    :       location_description <= 55:
    :       :...location_description <= 51: 1 (1383/1125)
    :           location_description > 51:
    :           :...location_description <= 52: 6 (561/464)
    :               location_description > 52:
    :               :...location_description <= 53: 1 (130/72)
    :                   location_description > 53: 6 (264/229)
    location_description > 67:
    :...location_description <= 75:
        :...location_description <= 71: 4 (622/542)
        :   location_description > 71: 5 (3744/3222)
        location_description > 75:
        :...location_description <= 76: 1 (644/505)
            location_description > 76:
            :...location_description <= 81: 4 (1480/1288)
                location_description > 81:
                :...location_description > 83: 9 (4532/3983)
                    location_description <= 83:
                    :...location_description <= 82: 4 (2819/2481)
                        location_description > 82: 1 (539/441)


Evaluation on training data (21338 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

        17 18055(84.6%)   <<


       (a)   (b)   (c)   (d)   (e)   (f)   (g)   (h)   (i)   (j)   (k)   (l)    <-classified as
      ----  ----  ----  ----  ----  ----  ----  ----  ----  ----  ----  ----
       818    43   245    87    33          10   262     5          24          (a): class 1
       245   436   444   287    86          36   360     1          12          (b): class 3
       339   252   610   525    69          56   482    22          12          (c): class 4
       203   107   432   589    55          27   333     8          10          (d): class 5
       441   334   514   419   132          42   417                20          (e): class 6
       297   210   468   526   128          56   465    18           9          (f): class 7
       396   141   525   382    53          92   492    17          20          (g): class 8
       303   129   402   307    81          74   549                19          (h): class 9
       221   173   461   267    47          60   432    25           9          (i): class 10
       199   213   324   236    46          33   289    17           6          (j): class 15
       439    72   221   142    47          18   206    14          32          (k): class 18
       171    29   275   244    48          25   245                 9          (l): class 22


    Attribute usage:

    100.00% location_description


Time: 0.0 secs
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 91.1886014248219 % 470 correct classified cases from 5334"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 84.6143031211922 % 3283 correct classified cases from 21338"
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#criminal_damage <- subset(criminal_damage, select=-c(location_description))
#criminal_damage_tr <- subset( criminal_damage_tr, select=-c(location_description))
criminal_damage_tr <- subset(criminal_damage_tr, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18"  | district == "9" | district=="20" | district == "25" | district == "11")
criminal_damage <- subset(criminal_damage, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18"  | district == "9" | district=="20" | district == "25" | district == "11")

train_c50<- subset(criminal_damage_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(criminal_damage, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)

#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.001))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.001))


C5.0 [Release 2.07 GPL Edition]     Wed May  6 16:43:00 2020
-------------------------------

Class specified by attribute `outcome'

Read 45588 cases (4 attributes) from undefined.data

Decision tree:

location_description in {ABANDONED BUILDING,
:                        AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA,
:                        AIRPORT BUILDING NON-TERMINAL - SECURE AREA,
:                        AIRPORT EXTERIOR - NON-SECURE AREA,
:                        AIRPORT EXTERIOR - SECURE AREA,AIRPORT PARKING LOT,
:                        AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,
:                        AIRPORT TERMINAL LOWER LEVEL - SECURE AREA,
:                        AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA,
:                        AIRPORT TERMINAL UPPER LEVEL - SECURE AREA,
:                        ANIMAL HOSPITAL,APPLIANCE STORE,CHA APARTMENT,
:                        CHA HALLWAY/STAIRWELL/ELEVATOR,
:                        CHA PARKING LOT/GROUNDS,
:                        CHURCH/SYNAGOGUE/PLACE OF WORSHIP,
:                        COIN OPERATED MACHINE,CREDIT UNION,
:                        CTA GARAGE / OTHER PROPERTY,CTA TRACKS - RIGHT OF WAY,
:                        CTA TRAIN,DRIVEWAY - RESIDENTIAL,
:                        FACTORY/MANUFACTURING BUILDING,FIRE STATION,
:                        FOREST PRESERVE,HIGHWAY/EXPRESSWAY,
:                        OTHER RAILROAD PROP / TRAIN DEPOT,PAWN SHOP,POOL ROOM,
:                        RESIDENCE,RESIDENCE PORCH/HALLWAY,RESIDENCE-GARAGE,
:                        RESIDENTIAL YARD (FRONT/BACK),SAVINGS AND LOAN,
:                        SCHOOL PUBLIC BUILDING,SCHOOL PUBLIC GROUNDS,
:                        VACANT LOT/LAND,
:                        VEHICLE - DELIVERY TRUCK}: 8 (14667/12108)
location_description in {APARTMENT,CTA BUS,CURRENCY EXCHANGE,
:                        GAS STATION}: 3 (5871/4778)
location_description in {ALLEY,ATHLETIC CLUB,ATM (AUTOMATIC TELLER MACHINE),
:                        AUTO / BOAT / RV DEALERSHIP,BANK,BAR OR TAVERN,
:                        BARBERSHOP,BOAT/WATERCRAFT,BOWLING ALLEY,BRIDGE,
:                        CAR WASH,CEMETARY,CLEANING STORE,
:                        COLLEGE/UNIVERSITY GROUNDS,
:                        COLLEGE/UNIVERSITY RESIDENCE HALL,
:                        COMMERCIAL / BUSINESS OFFICE,CONSTRUCTION SITE,
:                        CONVENIENCE STORE,CTA BUS STOP,CTA PLATFORM,
:                        CTA STATION,DAY CARE CENTER,DEPARTMENT STORE,
:                        DRUG STORE,FEDERAL BUILDING,
:                        GOVERNMENT BUILDING/PROPERTY,GROCERY FOOD STORE,
:                        HOSPITAL BUILDING/GROUNDS,HOTEL/MOTEL,
:                        JAIL / LOCK-UP FACILITY,
:                        LAKEFRONT/WATERFRONT/RIVERBANK,LIBRARY,
:                        MEDICAL/DENTAL OFFICE,MOVIE HOUSE/THEATER,
:                        NURSING HOME/RETIREMENT HOME,OTHER,
:                        OTHER COMMERCIAL TRANSPORTATION,PARK PROPERTY,
:                        PARKING LOT/GARAGE(NON.RESID.),
:                        POLICE FACILITY/VEH PARKING LOT,RESTAURANT,
:                        SCHOOL PRIVATE BUILDING,SCHOOL PRIVATE GROUNDS,
:                        SIDEWALK,SMALL RETAIL STORE,SPORTS ARENA/STADIUM,
:                        STREET,TAVERN/LIQUOR STORE,TAXICAB,
:                        VEHICLE - OTHER RIDE SERVICE,
:                        VEHICLE - OTHER RIDE SHARE SERVICE (),
:                        VEHICLE NON-COMMERCIAL,VEHICLE-COMMERCIAL,WAREHOUSE}:
:...location_description in {ALLEY,CAR WASH,DAY CARE CENTER,
    :                        NURSING HOME/RETIREMENT HOME,SPORTS ARENA/STADIUM,
    :                        STREET,VEHICLE NON-COMMERCIAL}: 8 (17364/15004)
    location_description in {ATHLETIC CLUB,ATM (AUTOMATIC TELLER MACHINE),
                             AUTO / BOAT / RV DEALERSHIP,BANK,BAR OR TAVERN,
                             BARBERSHOP,BOAT/WATERCRAFT,BOWLING ALLEY,BRIDGE,
                             CEMETARY,CLEANING STORE,
                             COLLEGE/UNIVERSITY GROUNDS,
                             COLLEGE/UNIVERSITY RESIDENCE HALL,
                             COMMERCIAL / BUSINESS OFFICE,CONSTRUCTION SITE,
                             CONVENIENCE STORE,CTA BUS STOP,CTA PLATFORM,
                             CTA STATION,DEPARTMENT STORE,DRUG STORE,
                             FEDERAL BUILDING,GOVERNMENT BUILDING/PROPERTY,
                             GROCERY FOOD STORE,HOSPITAL BUILDING/GROUNDS,
                             HOTEL/MOTEL,JAIL / LOCK-UP FACILITY,
                             LAKEFRONT/WATERFRONT/RIVERBANK,LIBRARY,
                             MEDICAL/DENTAL OFFICE,MOVIE HOUSE/THEATER,OTHER,
                             OTHER COMMERCIAL TRANSPORTATION,PARK PROPERTY,
                             PARKING LOT/GARAGE(NON.RESID.),
                             POLICE FACILITY/VEH PARKING LOT,RESTAURANT,
                             SCHOOL PRIVATE BUILDING,SCHOOL PRIVATE GROUNDS,
                             SIDEWALK,SMALL RETAIL STORE,TAVERN/LIQUOR STORE,
                             TAXICAB,VEHICLE - OTHER RIDE SERVICE,
                             VEHICLE - OTHER RIDE SHARE SERVICE (),
                             VEHICLE-COMMERCIAL,WAREHOUSE}: 1 (7686/6506)


Evaluation on training data (45588 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

         4 38396(84.2%)   <<


       (a)   (b)   (c)   (d)   (e)   (f)   (g)   (h)   (i)   (j)   (k)   (l)    <-classified as
      ----  ----  ----  ----  ----  ----  ----  ----  ----  ----  ----  ----
      1180    65                          1004                                  (a): class 1
       451  1093                          2304                                  (b): class 3
       582   611                          3429                                  (c): class 4
       402   308                          2987                                  (d): class 5
       613   878                          3080                                  (e): class 6
       354   584                          2820                                  (f): class 7
       896   438                          4919                                  (g): class 8
       648   420                          2969                                  (h): class 9
       548   773                          2873                                  (i): class 11
      1051    96                          1497                                  (j): class 18
       341   173                           835                                  (k): class 20
       620   432                          3314                                  (l): class 25


    Attribute usage:

    100.00% location_description


Time: 0.0 secs
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")
Error in model.frame.default(object$Terms, newdata, na.action = na.action,  : 
  factor location_description has new levels FARM
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#deceptive_practice <- subset(deceptive_practice, select=-c(location_description))
#deceptive_practice_tr <- subset(deceptive_practice_tr, select=-c(location_description))

deceptive_practice_tr <- subset(deceptive_practice_tr, district=="19" | district == "4" | district == "25" | district=="2" | district == "24" |district == "1" | district == "18"| district == "20"  | district == "9" | district=="21" | district == "25" | district == "11" )

deceptive_practice <- subset(deceptive_practice, district=="19" | district == "4" | district == "25" | district=="2" | district == "24" |district == "1" | district == "18"| district == "20"  | district == "9" | district=="21" | district == "25" | district == "11" )

#train_c50<- subset(deceptive_practice_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(deceptive_practice_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.5))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 79.3173028417689 % 1230 correct classified cases from 5947"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 78.8584398117014 % 5030 correct classified cases from 23792"
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#narcotics <- subset(narcotics, select=-c(location_description))
#narcotics_tr <- subset(narcotics_tr, select=-c(location_description))
#narcotics_tr <- subset(narcotics_tr, district=="10" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18")
#narcotics <- subset(narcotics, district=="10" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18")

#train_c50<- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(narcotics_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF = 0.01))


C5.0 [Release 2.07 GPL Edition]     Fri May  8 13:54:54 2020
-------------------------------

Class specified by attribute `outcome'

Read 16510 cases (3 attributes) from undefined.data

Decision tree:
 11 (16510/7867)


Evaluation on training data (16510 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

         1 7867(47.6%)   <<


       (a)   (b)   (c)   (d)   (e)   (f)    <-classified as
      ----  ----  ----  ----  ----  ----
                         364                (a): class 1
                        1199                (b): class 8
                        3178                (c): class 10
                        8643                (d): class 11
                        2769                (e): class 15
                         357                (f): class 18


Time: 0.0 secs
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 48.1948146353283 % 2138 correct classified cases from 4127"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 47.6499091459721 % 8643 correct classified cases from 16510"
## Creating a training and test datasets
set.seed(1234)
#robbery <- subset(robbery, select=-c(location_description))
#robbery_tr <- subset(robbery_tr, select=-c(location_description))
robbery_tr <- subset(robbery_tr, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4")
robbery <- subset(robbery, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4")

#robbery$location_description <- gsub("PARKING LOT","PARKING",robbery$location_description)
#robbery_tr$location_description <- gsub("PARKING LOT","PARKING",robbery_tr$location_description)
#robbery$location_description <- gsub("PARKINGGARAGE(NON.RESID.)","PARKING",robbery$location_description)
#robbery_tr$location_description <- gsub("PARKINGGARAGE(NON.RESID.)","PARKING",robbery_tr$location_description)

train_c50<- subset(robbery_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(robbery, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
train_c50$location_description <- factor(train_c50$location_description)
test_c50$location_description <- factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~., data=train_c50, control = C5.0Control(noGlobalPruning = TRUE, CF= 0.000001))  #Higher CF less prunning
summary(tree_result)

Call:
C5.0.formula(formula = district ~ ., data = train_c50, control = C5.0Control(noGlobalPruning = TRUE, CF = 1e-06))


C5.0 [Release 2.07 GPL Edition]     Wed May  6 17:02:50 2020
-------------------------------

Class specified by attribute `outcome'

Read 41594 cases (4 attributes) from undefined.data

Decision tree:

arrest = True:
:...location_description > 73:
:   :...location_description > 77:
:   :   :...location_description <= 85: 1 (1478/1201)
:   :   :   location_description > 85: 11 (738/598)
:   :   location_description <= 77:
:   :   :...location_description > 75: 8 (359/272)
:   :       location_description <= 75:
:   :       :...location_description <= 74: 5 (550/441)
:   :           location_description > 74: 11 (170/134)
:   location_description <= 73:
:   :...location_description > 54:
:       :...location_description > 66:
:       :   :...location_description <= 69: 1 (139/100)
:       :   :   location_description > 69: 6 (451/340)
:       :   location_description <= 66:
:       :   :...location_description > 56: 1 (648/338)
:       :       location_description <= 56:
:       :       :...location_description <= 55: 1 (2/1)
:       :           location_description > 55: 18 (101/38)
:       location_description <= 54:
:       :...location_description <= 37:
:           :...location_description > 12: 1 (885/655)
:           :   location_description <= 12:
:           :   :...location_description > 6: 6 (469/380)
:           :       location_description <= 6:
:           :       :...location_description <= 2: 5 (58/39)
:           :           location_description > 2: 8 (6)
:           location_description > 37:
:           :...location_description > 51: 11 (1040/784)
:               location_description <= 51:
:               :...location_description > 39: 1 (343/242)
:                   location_description <= 39:
:                   :...location_description <= 38: 1 (70/41)
:                       location_description > 38: 6 (297/185)
arrest = False:
:...location_description > 82: 11 (11217/9162)
    location_description <= 82:
    :...location_description > 70: 8 (11291/8218)
        location_description <= 70:
        :...location_description <= 11:
            :...location_description <= 9: 11 (1201/932)
            :   location_description > 9: 6 (4744/3827)
            location_description > 11:
            :...location_description <= 25:
                :...location_description > 22: 5 (188/108)
                :   location_description <= 22:
                :   :...location_description > 17: 8 (85/69)
                :       location_description <= 17:
                :       :...location_description <= 16: 1 (139/102)
                :           location_description > 16: 18 (60/32)
                location_description > 25:
                :...location_description > 63: 8 (1904/1546)
                    location_description <= 63:
                    :...location_description > 54:
                        :...location_description <= 58: 18 (159/89)
                        :   location_description > 58: 11 (67/53)
                        location_description <= 54:
                        :...location_description > 46: 11 (660/489)
                            location_description <= 46:
                            :...location_description > 36: 1 (904/637)
                                location_description <= 36:
                                :...location_description > 32: 11 (584/482)
                                    location_description <= 32:
                                    :...location_description > 28: 18 (473/350)
                                        location_description <= 28:
                                        :...location_description <= 26: 11 (91/75)
                                            location_description > 26: 8 (23/13)


Evaluation on training data (41594 cases):

        Decision Tree   
      ----------------  
      Size      Errors  

        35 31973(76.9%)   <<


       (a)   (b)   (c)   (d)   (e)   (f)   (g)   (h)   (i)    <-classified as
      ----  ----  ----  ----  ----  ----  ----  ----  ----
      1291          28   249   649        1145         173    (a): class 1
       341         102  1023  1905        1586          64    (b): class 4
       315         208   314  1598        1344          36    (c): class 5
       450          90  1229  1796        2295          61    (d): class 6
       390          90   666  3550        1944          64    (e): class 8
       416          80   722  1601        1497          42    (f): class 9
       435          99   799  1081        3059          44    (g): class 11
       184          45   574   699        1652          25    (h): class 15
       786          54   385   789        1246         284    (i): class 18


    Attribute usage:

    100.00% location_description
    100.00% arrest


Time: 0.0 secs
#Plotting the tree
plot(tree_result,subtree=NULL)


## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))
[1] "The classification error in test set is: 82.5898772193269 % 1971 correct classified cases from 11321"
pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
[1] "The classification error in train set is: 76.8692599894216 % 9621 correct classified cases from 41594"
## Creating a training and test datasets
set.seed(1234)
theft <- subset(theft, select=-c(location_description))
theft_tr <- subset(theft_tr, select=-c(location_description))
theft_tr <- subset(theft_tr, district=="19" | district == "1" | district=="18"| district=="24" | district == "5" | district=="4")
theft <- subset(theft, district=="19" | district == "1" | district=="18"| district=="24" | district == "5" | district=="4")

train_c50<- subset(theft_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(theft, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
set.seed(1234)
#violent_crime <- subset(violent_crime, select=-c(location_description))
#violent_tr_crime <- subset(violent_tr_crime, select=-c(location_description))
violent_tr_crime <- subset(violent_tr_crime, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4" | district == "19"| district == "20"  | district == "10" | district=="21" | district == "25" | district == "2")
violent_crime <- subset(violent_crime, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4" | district == "19"| district == "20"  | district == "10" | district=="21" | district == "25" | district == "2")

train_c50<- subset(violent_tr_crime, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(violent_crime, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, trials = 10,control = C5.0Control(noGlobalPruning = FALSE, CF= 0.000001))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL,trial=9)
Only 1 trials are in the model. Setting 'trial' to 0 (the plot code is zero-based).

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")
Error in model.frame.default(object$Terms, newdata, na.action = na.action,  : 
  factor location_description has new levels , ATM (AUTOMATIC TELLER MACHINE), CHA PLAY LOT, HORSE STABLE, TRAILER, VEHICLE - DELIVERY TRUCK, VEHICLE-COMMERCIAL - TROLLEY BUS
set.seed(1234)
#weapons_violation <- subset(weapons_violation, select=-c(location_description))
#weapons_violation_tr <- subset(weapons_violation_tr, select=-c(location_description))
weapons_violation_tr <- subset(weapons_violation_tr, district=="6" | district == "11" | district == "7" | district=="8" | district == "1" | district=="18" | district == "5" | district=="4")
weapons_violation <- subset(weapons_violation, district=="6" | district == "11" | district == "7" | district=="8" | district == "1" | district=="18" | district == "5" | district=="4")

train_c50<- subset(weapons_violation_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(weapons_violation, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
#Tunning the m number of predictors
tuning_rf_mtry <- function(df, y, ntree = 100){
  
  require(dplyr)
  max_predictors <- ncol(df) - 1
  n_predictors   <- rep(NA, max_predictors)
  oob_err_rate   <- rep(NA, max_predictors)
  for (i in 1:max_predictors) {
    set.seed(123)
    f <- formula(paste(y,"~ ."))
    model_rf <- randomForest(formula = f, data = df, mtry = i, ntree = ntree)
    n_predictors[i] <- i
    oob_err_rate[i] <- tail(model_rf$err.rate[,1], n = 1)
  }
  results <- data_frame(n_predictors, oob_err_rate)
  return(results)
}

hiperparameter_mtry <-  tuning_rf_mtry(df = RF_train, y = "primary_type")
Error in oob_err_rate[i] <- tail(model_rf$err.rate[, 1], n = 1) : 
  replacement has length zero

REDES NEURONALES

## REDES
chicago_crime_nn <- subset(chicago_crime_subset, select=-c(case_number,block,ward,day,latitude,longitude))
chicago_crime_nn_tr <- subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- chicago_crime_nn_tr$primary_type
chicago_crime_nn_tr <- subset(chicago_crime_nn_tr, select=-c(primary_type))

target_test <- chicago_crime_nn$primary_type
chicago_crime_nn <- subset(chicago_crime_nn, select=-c(primary_type))

chicago_crime_nn_tr$arrest <- as.factor(chicago_crime_nn_tr$arrest)
chicago_crime_nn_tr$arrest <- as.numeric(chicago_crime_nn_tr$arrest)
#chicago_crime$primary_type <- as.numeric(chicago_crime$primary_type)
chicago_crime_nn_tr$district <- as.numeric(chicago_crime_nn_tr$district)
#chicago_crime_nn_tr$district <- as.factor(chicago_crime_nn_tr$district)
chicago_crime_nn_tr$description <- as.factor(chicago_crime_nn_tr$description)
chicago_crime_nn_tr$description <- as.numeric(chicago_crime_nn_tr$description)
chicago_crime_nn_tr$location_description <- as.factor(chicago_crime_nn_tr$location_description)
chicago_crime_nn_tr$location_description <- as.numeric(chicago_crime_nn_tr$location_description)
chicago_crime_nn_tr$month <- as.numeric(chicago_crime_nn_tr$month)
#chicago_crime_nn_tr$month <- as.factor(chicago_crime_nn_tr$month)

chicago_crime_nn$arrest <- as.factor(chicago_crime_nn$arrest)
chicago_crime_nn$arrest <- as.numeric(chicago_crime_nn$arrest)
#chicago_crime_nn$primary_type <- as.numeric(chicago_crime_nn$primary_type)
chicago_crime_nn$district <- as.numeric(chicago_crime_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
chicago_crime_nn$description <- as.factor(chicago_crime_nn$description)
chicago_crime_nn$description <- as.numeric(chicago_crime_nn$description)
chicago_crime_nn$location_description <- as.factor(chicago_crime_nn$location_description)
chicago_crime_nn$location_description <- as.numeric(chicago_crime_nn$location_description)
chicago_crime_nn$month <- as.numeric(chicago_crime_nn$month)
#chicago_crime_nn$month <- as.factor(chicago_crime_nn$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
train <- chicago_crime_nn_tr[index,]
test <- chicago_crime_nn_tr[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train[1:100000,], grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)
[1] "description"          "location_description" "arrest"               "district"             "month"               
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 100000 objects.
Mean distance to the closest unit in the map: 78.082.
#Showing the training process
plot(crime.som, type="changes")


#node counts
plot(crime.som, type="counts", main="Examples per Neuron")


#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

coolBlueHOtRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}
par(mfrow=c(2,3))

plot(crime.som, type="property", property=getCodes(crime.som, 1)[,1],
     main=colnames(getCodes(crime.som, 1))[1],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,2],
     main=colnames(getCodes(crime.som, 1))[2],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,3],
     main=colnames(getCodes(crime.som, 1))[3],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,4],
     main=colnames(getCodes(crime.som, 1))[4],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,5],
     main=colnames(getCodes(crime.som, 1))[5],
     palette.name=coolBlueHOtRed)
#plot(crime.som, type="property", property=getCodes(crime.som, 1)[,6],
#     main=colnames(getCodes(crime.som, 1))[6],
#     palette.name=coolBlueHOtRed)

# Alternative easier
coolBlueHotRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}

par(mfrow=c(5,3))

for (j in 1:ncol(train)) {
plot(crime.som, type="property", property=crime.som$codes[[1]][,j],
     palette.name=coolBlueHotRed,
     main=colnames(train)[j], cex=0.5)
}


#som.prediction <- predict(crime.som, test)
#Clustering patterns in the map
groups<-8
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = rainbow(groups)[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)


#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
#index <- sample(nrow(target_train.sc), round(0.75*nrow(crime.sc)))
#train <- target_train.sc
#test <- target_test.sc

#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
#train <- chicago_crime_nn_tr[index,]
#test <- chicago_crime_nn_tr[-index,]
#train_label<-target_train
#test_label<-target_test

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train[1:100000,], train_label[1:100000],
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")


#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))


plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))


# Plotting classes in neurons
plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)


#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics

                    
train_label          ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY  THEFT VIOLENT CRIME WEAPONS VIOLATION
  ASSAULT               4120               0                 15         0    1442   3264         23831                 0
  CRIMINAL DAMAGE        327           52639               1472         2    3088    322           259               120
  DECEPTIVE PRACTICE     246             458              10222         0    9272  14838          4850                24
  NARCOTICS               94               0                206     16574     621   9233          1396                 0
  ROBBERY                555           17038               1937         0   30716   8157          8735               688
  THEFT                  130            1345               1644         0    2375 142814         12255                 0
  VIOLENT CRIME         4098              56                894        70    1847   7334         45993                20
  WEAPONS VIOLATION       78            3119                  1        84    1459     93           774              4591

Overall Statistics
                                          
               Accuracy : 0.672           
                 95% CI : (0.6706, 0.6734)
    No Information Rate : 0.4064          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5821          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       

Statistics by Class:

                     Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity                0.427032                 0.7051                   0.62363          0.99068        0.60441       0.7676
Specificity                0.936294                 0.9854                   0.93275          0.97382        0.90882       0.9347
Pos Pred Value             0.126102                 0.9040                   0.25613          0.58932        0.45286       0.8895
Neg Pred Value             0.986998                 0.9449                   0.98524          0.99964        0.94845       0.8545
Prevalence                 0.021073                 0.1631                   0.03580          0.03654        0.11100       0.4064
Detection Rate             0.008999                 0.1150                   0.02233          0.03620        0.06709       0.3119
Detection Prevalence       0.071362                 0.1272                   0.08717          0.06143        0.14815       0.3507
Balanced Accuracy          0.681663                 0.8453                   0.77819          0.98225        0.75662       0.8511
                     Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity                        0.4689                  0.84347
Specificity                        0.9602                  0.98760
Pos Pred Value                     0.7626                  0.45014
Neg Pred Value                     0.8689                  0.99810
Prevalence                         0.2143                  0.01189
Detection Rate                     0.1005                  0.01003
Detection Prevalence               0.1317                  0.02228
Balanced Accuracy                  0.7145                  0.91554
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics

                    
test_label           ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
  ASSAULT               1293               0                 11         0     482  1117          7937                 0
  CRIMINAL DAMAGE        100           17536                532         0    1044    86            93                38
  DECEPTIVE PRACTICE      79             146               3319         0    3092  5130          1598                 6
  NARCOTICS               33               0                 74      5598     207  3013           466                 0
  ROBBERY                219            5663                632         0   10195  2713          2872               242
  THEFT                   51             430                567         0     831 47387          4179                 0
  VIOLENT CRIME         1374              21                310        17     624  2447         15460                 9
  WEAPONS VIOLATION       35            1015                  0        24     507    28           224              1506

Overall Statistics
                                          
               Accuracy : 0.6703          
                 95% CI : (0.6679, 0.6726)
    No Information Rate : 0.4057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.58            
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity                0.406093                 0.7068                   0.60955          0.99273         0.6003       0.7653
Specificity                0.936110                 0.9852                   0.93170          0.97419         0.9090       0.9332
Pos Pred Value             0.119280                 0.9026                   0.24824          0.59610         0.4524       0.8866
Neg Pred Value             0.986662                 0.9454                   0.98473          0.99971         0.9478       0.8534
Prevalence                 0.020863                 0.1626                   0.03568          0.03695         0.1113       0.4057
Detection Rate             0.008472                 0.1149                   0.02175          0.03668         0.0668       0.3105
Detection Prevalence       0.071030                 0.1273                   0.08761          0.06154         0.1477       0.3502
Balanced Accuracy          0.671101                 0.8460                   0.77063          0.98346         0.7547       0.8492
                     Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity                        0.4709                 0.836202
Specificity                        0.9599                 0.987846
Pos Pred Value                     0.7630                 0.451033
Neg Pred Value                     0.8688                 0.998024
Prevalence                         0.2151                 0.011801
Detection Rate                     0.1013                 0.009868
Detection Prevalence               0.1328                 0.021879
Balanced Accuracy                  0.7154                 0.912024
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics

                    
test_label           ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
  ASSAULT               1293               0                 11         0     482  1117          7937                 0
  CRIMINAL DAMAGE        100           17536                532         0    1044    86            93                38
  DECEPTIVE PRACTICE      79             146               3319         0    3092  5130          1598                 6
  NARCOTICS               33               0                 74      5598     207  3013           466                 0
  ROBBERY                219            5663                632         0   10195  2713          2872               242
  THEFT                   51             430                567         0     831 47387          4179                 0
  VIOLENT CRIME         1374              21                310        17     624  2447         15460                 9
  WEAPONS VIOLATION       35            1015                  0        24     507    28           224              1506

Overall Statistics
                                          
               Accuracy : 0.6703          
                 95% CI : (0.6679, 0.6726)
    No Information Rate : 0.4057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.58            
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity                0.406093                 0.7068                   0.60955          0.99273         0.6003       0.7653
Specificity                0.936110                 0.9852                   0.93170          0.97419         0.9090       0.9332
Pos Pred Value             0.119280                 0.9026                   0.24824          0.59610         0.4524       0.8866
Neg Pred Value             0.986662                 0.9454                   0.98473          0.99971         0.9478       0.8534
Prevalence                 0.020863                 0.1626                   0.03568          0.03695         0.1113       0.4057
Detection Rate             0.008472                 0.1149                   0.02175          0.03668         0.0668       0.3105
Detection Prevalence       0.071030                 0.1273                   0.08761          0.06154         0.1477       0.3502
Balanced Accuracy          0.671101                 0.8460                   0.77063          0.98346         0.7547       0.8492
                     Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity                        0.4709                 0.836202
Specificity                        0.9599                 0.987846
Pos Pred Value                     0.7630                 0.451033
Neg Pred Value                     0.8688                 0.998024
Prevalence                         0.2151                 0.011801
Detection Rate                     0.1013                 0.009868
Detection Prevalence               0.1328                 0.021879
Balanced Accuracy                  0.7154                 0.912024
## REDES
chicago_crime_nn2 <- subset(chicago_crime_subset, select=-c(description,case_number,block,day,latitude,longitude))
chicago_crime_nn_tr2 <- subset(chicago_crime_subset_tr, select=-c(description,case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train2 <- chicago_crime_nn_tr2$primary_type
chicago_crime_nn_tr2 <- subset(chicago_crime_nn_tr2, select=-c(primary_type))

target_test2 <- chicago_crime_nn2$primary_type
chicago_crime_nn2 <- subset(chicago_crime_nn2, select=-c(primary_type))

chicago_crime_nn_tr2$arrest <- as.factor(chicago_crime_nn_tr2$arrest)
chicago_crime_nn_tr2$arrest <- as.numeric(chicago_crime_nn_tr2$arrest)
#chicago_crime$primary_type <- as.numeric(chicago_crime$primary_type)
chicago_crime_nn_tr2$district <- as.numeric(chicago_crime_nn_tr2$district)
#chicago_crime_nn_tr$district <- as.factor(chicago_crime_nn_tr$district)
#chicago_crime_nn_tr2$description <- as.factor(chicago_crime_nn_tr2$description)
#chicago_crime_nn_tr2$description <- as.numeric(chicago_crime_nn_tr2$description)
chicago_crime_nn_tr2$location_description <- as.factor(chicago_crime_nn_tr2$location_description)
chicago_crime_nn_tr2$location_description <- as.numeric(chicago_crime_nn_tr2$location_description)
chicago_crime_nn_tr2$month <- as.numeric(chicago_crime_nn_tr2$month)
chicago_crime_nn_tr$month <- as.factor(chicago_crime_nn_tr$month)

chicago_crime_nn2$arrest <- as.factor(chicago_crime_nn2$arrest)
chicago_crime_nn2$arrest <- as.numeric(chicago_crime_nn2$arrest)
#chicago_crime_nn$primary_type <- as.numeric(chicago_crime_nn$primary_type)
chicago_crime_nn2$district <- as.numeric(chicago_crime_nn2$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
#chicago_crime_nn2$description <- as.factor(chicago_crime_nn2$description)
#chicago_crime_nn2$description <- as.numeric(chicago_crime_nn2$description)
chicago_crime_nn2$location_description <- as.factor(chicago_crime_nn2$location_description)
chicago_crime_nn2$location_description <- as.numeric(chicago_crime_nn2$location_description)
chicago_crime_nn2$month <- as.numeric(chicago_crime_nn2$month)
chicago_crime_nn$month <- as.factor(chicago_crime_nn$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(chicago_crime_nn_tr2), round(0.75*nrow(chicago_crime_nn_tr2)))
train <- chicago_crime_nn_tr2[index,]
test <- chicago_crime_nn_tr2[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train2[index]
test_label<-target_train2[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train[1:100000,], grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)
[1] "location_description" "arrest"               "district"             "ward"                 "month"               
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 100000 objects.
Mean distance to the closest unit in the map: 31.207.
#Showing the training process
plot(crime.som, type="changes")


#node counts
plot(crime.som, type="counts", main="Examples per Neuron")


#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

coolBlueHOtRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}
par(mfrow=c(2,3))

plot(crime.som, type="property", property=getCodes(crime.som, 1)[,1],
     main=colnames(getCodes(crime.som, 1))[1],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,2],
     main=colnames(getCodes(crime.som, 1))[2],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,3],
     main=colnames(getCodes(crime.som, 1))[3],
     palette.name=coolBlueHOtRed)
#plot(crime.som, type="property", property=getCodes(crime.som, 1)[,4],
#     main=colnames(getCodes(crime.som, 1))[4],
#     palette.name=coolBlueHOtRed)
#plot(crime.som, type="property", property=getCodes(crime.som, 1)[,5],
#     main=colnames(getCodes(crime.som, 1))[5],
#     palette.name=coolBlueHOtRed)
#plot(crime.som, type="property", property=getCodes(crime.som, 1)[,6],
#     main=colnames(getCodes(crime.som, 1))[6],
#     palette.name=coolBlueHOtRed)

# Alternative easier
coolBlueHotRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}

par(mfrow=c(5,3))

for (j in 1:ncol(train)) {
plot(crime.som, type="property", property=crime.som$codes[[1]][,j],
     palette.name=coolBlueHotRed,
     main=colnames(train)[j], cex=0.5)
}

#Clustering patterns in the map
library(RColorBrewer)
groups<-8
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = brewer.pal(groups, name="YlGnBu")[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
#index <- sample(nrow(target_train.sc), round(0.75*nrow(crime.sc)))
#train <- target_train.sc
#test <- target_test.sc

#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
#train <- chicago_crime_nn_tr[index,]
#test <- chicago_crime_nn_tr[-index,]
#train_label<-target_train
#test_label<-target_test

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train[1:100000,], train_label[1:100000],
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")


#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))


plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics

                    
train_label          ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY  THEFT VIOLENT CRIME WEAPONS VIOLATION
  ASSAULT                  0               0                327      1453    5590  22496          2806                 0
  CRIMINAL DAMAGE          0               0                853       663   11935  44081           697                 0
  DECEPTIVE PRACTICE       0               0               2343       640    8667  27933           327                 0
  NARCOTICS                0               0                 87      5353    3029  18090          1565                 0
  ROBBERY                  0               0               1297      2509   16747  44776          2497                 0
  THEFT                    0               0               1322      1954   12329 142118          2840                 0
  VIOLENT CRIME            0               0                590      3873   10745  39590          5514                 0
  WEAPONS VIOLATION        0               0                 31       649    1778   7098           643                 0

Overall Statistics
                                          
               Accuracy : 0.3758          
                 95% CI : (0.3744, 0.3772)
    No Information Rate : 0.7561          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.1127          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity                      NA                     NA                  0.342044          0.31315        0.23647       0.4105
Specificity                 0.92864                 0.8728                  0.916700          0.94833        0.86802       0.8348
Pos Pred Value                   NA                     NA                  0.058707          0.19034        0.24691       0.8851
Neg Pred Value                   NA                     NA                  0.989216          0.97268        0.86135       0.3135
Prevalence                  0.00000                 0.0000                  0.014962          0.03734        0.15468       0.7561
Detection Rate              0.00000                 0.0000                  0.005118          0.01169        0.03658       0.3104
Detection Prevalence        0.07136                 0.1272                  0.087171          0.06143        0.14815       0.3507
Balanced Accuracy                NA                     NA                  0.629372          0.63074        0.55225       0.6227
                     Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity                       0.32648                       NA
Specificity                       0.87573                  0.97772
Pos Pred Value                    0.09142                       NA
Neg Pred Value                    0.97139                       NA
Prevalence                        0.03689                  0.00000
Detection Rate                    0.01204                  0.00000
Detection Prevalence              0.13173                  0.02228
Balanced Accuracy                 0.60111                       NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics

                    
test_label           ASSAULT CRIMINAL DAMAGE DECEPTIVE PRACTICE NARCOTICS ROBBERY THEFT VIOLENT CRIME WEAPONS VIOLATION
  ASSAULT                  0               0                 91       514    1938  7375           922                 0
  CRIMINAL DAMAGE          0               0                300       221    3912 14800           196                 0
  DECEPTIVE PRACTICE       0               0                763       219    3003  9284           101                 0
  NARCOTICS                0               0                 44      1846     972  6030           499                 0
  ROBBERY                  0               0                413       849    5544 14908           822                 0
  THEFT                    0               0                425       680    4163 47221           956                 0
  VIOLENT CRIME            0               0                185      1277    3599 13357          1844                 0
  WEAPONS VIOLATION        0               0                 15       244     583  2289           208                 0

Overall Statistics
                                          
               Accuracy : 0.3749          
                 95% CI : (0.3725, 0.3774)
    No Information Rate : 0.7553          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.1122          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: ASSAULT Class: CRIMINAL DAMAGE Class: DECEPTIVE PRACTICE Class: NARCOTICS Class: ROBBERY Class: THEFT
Sensitivity                      NA                     NA                   0.34123          0.31556        0.23379       0.4097
Specificity                 0.92897                 0.8727                   0.91616          0.94859        0.86817       0.8334
Pos Pred Value                   NA                     NA                   0.05707          0.19657        0.24601       0.8835
Neg Pred Value                   NA                     NA                   0.98942          0.97204        0.86031       0.3139
Prevalence                  0.00000                 0.0000                   0.01465          0.03833        0.15539       0.7553
Detection Rate              0.00000                 0.0000                   0.00500          0.01210        0.03633       0.3094
Detection Prevalence        0.07103                 0.1273                   0.08761          0.06154        0.14767       0.3502
Balanced Accuracy                NA                     NA                   0.62870          0.63207        0.55098       0.6215
                     Class: VIOLENT CRIME Class: WEAPONS VIOLATION
Sensitivity                       0.33237                       NA
Specificity                       0.87476                  0.97812
Pos Pred Value                    0.09101                       NA
Neg Pred Value                    0.97201                       NA
Prevalence                        0.03635                  0.00000
Detection Rate                    0.01208                  0.00000
Detection Prevalence              0.13277                  0.02188
Balanced Accuracy                 0.60357                       NA
## REDES
narcotics_nn <- subset(narcotics_tr, select=-c(case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- narcotics_nn$district
narcotics_nn <- subset(narcotics_nn, select=-c(district))

narcotics_nn$arrest <- as.factor(narcotics_nn$arrest)
narcotics_nn$arrest <- as.numeric(narcotics_nn$arrest)
narcotics_nn$primary_type <- as.factor(narcotics_nn$primary_type)
narcotics_nn$primary_type <- as.numeric(narcotics_nn$primary_type)
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
narcotics_nn$description <- as.factor(narcotics_nn$description)
narcotics_nn$description <- as.numeric(narcotics_nn$description)
narcotics_nn$location_description <- as.factor(narcotics_nn$location_description)
narcotics_nn$location_description <- as.numeric(narcotics_nn$location_description)
narcotics_nn$month <- as.numeric(narcotics_nn$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(narcotics_nn), round(0.75*nrow(narcotics_nn)))
train <- narcotics_nn[index,]
test <- narcotics_nn[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train, grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)
[1] "primary_type"         "description"          "location_description" "arrest"               "ward"                
[6] "month"               
# main characteristics of the map
summary(crime.som)
SOM of size 8x8 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 28136 objects.
Mean distance to the closest unit in the map: 27.532.
#Showing the training process
plot(crime.som, type="changes")


#node counts
plot(crime.som, type="counts", main="Examples per Neuron")


#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
#index <- sample(nrow(target_train.sc), round(0.75*nrow(crime.sc)))
#train <- target_train.sc
#test <- target_test.sc

#train <- chicago_crime_nn_tr
#test <- chicago_crime_nn
#index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
#train <- chicago_crime_nn_tr[index,]
#test <- chicago_crime_nn_tr[-index,]
#train_label<-target_train
#test_label<-target_test

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")


#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))


plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics

           
train_label    1    2    3    4    5    6    7    8    9   10   11   12   14   15   16   17   18   19   20   22   24   25   31
         1    34   28    0  127    0   17   25    7    0    0   13    0    0   19   28    0   32   16    0    0   14    0    0
         2    11  163    6  200    0   23  102   31    0   30    4    0    0    0    0    0    1    0    0    0    0    0    0
         3     6   98   28  215    0   26  174   32    0  205   42    0    0    0    0    0    1    0    0    0    0    0    0
         4    11  103    0  458    0   24  438   19  125    0    3    0    0    0    0    0    2    0    0    0    0    0    0
         5    22    3    2  190   23   27  283   14   90   26  135    0    0  119   15    0    2    0    0    0    3  127    0
         6    12   69   22  242    0   95  552   43   31  326  178    0    0    0    0    0    2    0    0    0    0    2    0
         7    18   61    0  104    0   45 1282   51  183  139  231    0    0    0    0    0    0    0    0    0    0    1    0
         8     3    0    0    5    0    9  527   55  216  179  147    0    0    0    0    0    0    0    0    0    0    4    0
         9    12   28    8   33    0   23  477   24  306  115   81    0    0    0    0    0    0    0    0    0    0    0    0
         10    3    0   11    3    0   10   69   55   35 1749 1059    0    0    3    0    0    1    0    0    0    0    5    0
         11    1   66    4   98    2    1   85   54    0 1094 6437    0    0  120   47    0    2    2    0    0    9   52    0
         12   24   55    0  102    0   16   41   10    0   55  321    0    0    7    2    0    2    0    0    0    1    7    0
         14    1   19    0   28    5    4   22    5    0   17   79    0    0   15    2    0    0    2    0    0    0   30    0
         15    0    0    1    0    6    0   16    4    0  186 1547    0    0  563   65    0    3   10    0    0   17  176    0
         16    0    0    0    0    0    0    0    1    0    0   83    0    0   45  248    0   29   22    0    0   55   82    0
         17    0    0    0    0    0    0    1    0    0    0   31    0    0   56   31    0    5   16    0    0   15  100    0
         18    0    1    0    2    1    0    1    0    0    5   95    0    0   39   62    0   69   26    0    0   38    8    0
         19    0    1    0    1    0    0    2    0    0    0    9    0    0   15   53    0   29   77    0    0  111    5    0
         20    0    0    0    0    0    0    0    0    0    0    6    0    0   14   29    0   28   34    0    0   67   19    0
         22    0    0   17    4   10   12   17    4    2  176  124    0    0   44    3    0    0    0    0    0    0   56    0
         24    0    0    0    0    0    0    0    0    0    0    1    0    0    3   47    0   21   40    0    0  173    4    0
         25    0    0    0    0   11    0    4    1    0   22  553    0    0  243   34    0    1   18    0    0   19  299    0
         31    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0

Overall Statistics
                                          
               Accuracy : 0.4286          
                 95% CI : (0.4228, 0.4344)
    No Information Rate : 0.3973          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3245          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 1 Class: 2  Class: 3 Class: 4  Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity          0.215190 0.234532 0.2828283  0.25276 0.3965517 0.286145  0.31132 0.134146  0.30972   0.40449    0.5758        NA
Specificity          0.988348 0.985132 0.9715019  0.97246 0.9623193 0.946806  0.96532 0.960687  0.97050   0.94734    0.9035   0.97715
Pos Pred Value       0.094444 0.285464 0.0338573  0.38715 0.0212766 0.060356  0.60615 0.048035  0.27642   0.58242    0.7973        NA
Neg Pred Value       0.995536 0.980700 0.9974001  0.94976 0.9987063 0.991077  0.89101 0.986847  0.97477   0.89755    0.7636        NA
Prevalence           0.005616 0.024701 0.0035186  0.06440 0.0020614 0.011800  0.14636 0.014572  0.03512   0.15368    0.3973   0.00000
Detection Rate       0.001208 0.005793 0.0009952  0.01628 0.0008175 0.003376  0.04556 0.001955  0.01088   0.06216    0.2288   0.00000
Detection Prevalence 0.012795 0.020294 0.0293929  0.04205 0.0384205 0.055943  0.07517 0.040695  0.03934   0.10673    0.2870   0.02285
Balanced Accuracy    0.601769 0.609832 0.6271651  0.61261 0.6794355 0.616475  0.63832 0.547417  0.64011   0.67591    0.7396        NA
                     Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity                 NA   0.43142  0.372372        NA  0.300000  0.292776        NA        NA  0.331418   0.30604        NA
Specificity           0.991861   0.92430  0.988460  0.990937  0.990038  0.991892  0.992998   0.98333  0.995799   0.96664         1
Pos Pred Value              NA   0.21704  0.438938        NA  0.198847  0.254125        NA        NA  0.598616   0.24813        NA
Neg Pred Value              NA   0.97095  0.984839        NA  0.994206  0.993317        NA        NA  0.987467   0.97482        NA
Prevalence            0.000000   0.04638  0.023671  0.000000  0.008175  0.009347  0.000000   0.00000  0.018553   0.03472         0
Detection Rate        0.000000   0.02001  0.008814  0.000000  0.002452  0.002737  0.000000   0.00000  0.006149   0.01063         0
Detection Prevalence  0.008139   0.09220  0.020081  0.009063  0.012333  0.010769  0.007002   0.01667  0.010272   0.04283         0
Balanced Accuracy           NA   0.67786  0.680416        NA  0.645019  0.642334        NA        NA  0.663608   0.63634        NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics

          
test_label    1    2    3    4    5    6    7    8    9   10   11   12   14   15   16   17   18   19   20   22   24   25   31
        1    11    6    0   49    0    1    3    1    0    0    1    0    0    2    8    0   10    4    0    0    3    0    0
        2     4   53    0   76    0   11   28    9    0    9    2    0    0    0    0    0    0    0    0    0    0    0    0
        3     2   42    7   66    0    5   65   13    0   75   17    0    0    0    0    0    0    0    0    0    0    0    0
        4     4   42    0  145    0    6  140    6   37    0    1    0    0    0    0    0    0    0    0    0    0    1    0
        5     1    1    0   65    6    6  120    3   38   10   44    0    0   36    2    0    0    0    0    0    2   43    0
        6     3   23   10   70    0   26  178   16   20  109   65    0    0    0    0    0    0    0    0    0    0    0    0
        7     1   10    1   37    0   13  406   17   65   33   95    0    0    0    0    0    0    0    0    0    0    0    0
        8     1    0    0    1    0    1  145   11   74   57   51    0    0    0    0    0    0    0    0    0    0    2    0
        9     5    8    4   14    0   13  169    8  131   31   22    0    0    0    0    0    0    0    0    0    0    0    0
        10    1    0    6    0    0    6   20   13   13  576  359    0    0    1    0    0    1    0    0    0    0    0    0
        11    0   21    1   28    1    0   27   25    0  368 2148    0    0   41   19    0    0    0    0    0    1   27    0
        12    5   25    0   22    0    7   19    3    0   24   94    0    0    0    1    0    0    0    0    0    0    1    0
        14    0    9    0    4    0    2    3    3    0    2   42    0    0    7    1    0    0    0    0    0    0   17    0
        15    0    0    1    0    5    0    6    2    0   61  505    0    0  202   17    0    1    6    0    0    5   60    0
        16    0    0    0    0    2    0    0    1    0    0   32    0    0   12   70    0   10   12    0    0   19   30    0
        17    0    0    0    0    0    0    0    0    0    0   13    0    0   13    9    0    4    5    0    0    4   36    0
        18    0    0    0    0    0    0    0    0    0    1   20    0    0   19   16    0   17   11    0    0   13    1    0
        19    0    0    0    0    0    0    2    0    0    0    4    0    0    5   19    0   14   32    0    0   53    2    0
        20    0    0    0    0    0    0    1    0    0    0    3    0    0    3    6    0    7   16    0    0   25    8    0
        22    0    0    8    1    2    2    5    0    2   67   36    0    0   12    2    0    0    0    0    0    0   21    0
        24    0    0    0    0    0    0    0    0    0    0    1    0    0    0   13    0    9   20    0    0   61    2    0
        25    0    0    0    0    3    0    2    0    0    4  176    0    0   92   17    0    0    7    0    0    9   82    0
        31    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0

Overall Statistics
                                          
               Accuracy : 0.4248          
                 95% CI : (0.4147, 0.4349)
    No Information Rate : 0.3978          
    P-Value [Acc > NIR] : 5.519e-08       
                                          
                  Kappa : 0.3199          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 1 Class: 2  Class: 3 Class: 4  Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity          0.289474 0.220833 0.1842105  0.25087 0.3157895 0.262626  0.30321 0.083969  0.34474   0.40364    0.5757        NA
Specificity          0.990579 0.984790 0.9694893  0.97307 0.9603632 0.946767  0.96617 0.964100  0.96955   0.94718    0.9010   0.97857
Pos Pred Value       0.111111 0.276042 0.0239726  0.37958 0.0159151 0.050000  0.59882 0.032070  0.32346   0.57831    0.7935        NA
Neg Pred Value       0.997091 0.979645 0.9965885  0.95187 0.9985559 0.991760  0.89277 0.986720  0.97225   0.89849    0.7627        NA
Prevalence           0.004052 0.025589 0.0040516  0.06163 0.0020258 0.010555  0.14277 0.013967  0.04052   0.15215    0.3978   0.00000
Detection Rate       0.001173 0.005651 0.0007463  0.01546 0.0006397 0.002772  0.04329 0.001173  0.01397   0.06141    0.2290   0.00000
Detection Prevalence 0.010555 0.020471 0.0311334  0.04073 0.0401962 0.055443  0.07229 0.036571  0.04318   0.10619    0.2886   0.02143
Balanced Accuracy    0.640026 0.602812 0.5768499  0.61197 0.6380764 0.604697  0.63469 0.524035  0.65714   0.67541    0.7384        NA
                     Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity                 NA   0.45393  0.350000        NA  0.232877  0.283186        NA        NA  0.312821  0.246246        NA
Specificity           0.990404   0.92512  0.987145  0.991044  0.991296  0.989316  0.992643   0.98315  0.995100  0.965731         1
Pos Pred Value              NA   0.23192  0.372340        NA  0.173469  0.244275        NA        NA  0.575472  0.209184        NA
Neg Pred Value              NA   0.97144  0.985856        NA  0.993966  0.991241        NA        NA  0.985549  0.972071        NA
Prevalence            0.000000   0.04745  0.021324  0.000000  0.007783  0.012048  0.000000   0.00000  0.020791  0.035505         0
Detection Rate        0.000000   0.02154  0.007463  0.000000  0.001813  0.003412  0.000000   0.00000  0.006504  0.008743         0
Detection Prevalence  0.009596   0.09287  0.020045  0.008956  0.010449  0.013967  0.007357   0.01685  0.011302  0.041796         0
Balanced Accuracy           NA   0.68953  0.668572        NA  0.612086  0.636251        NA        NA  0.653960  0.605988        NA
## REDES
crimen <- subset(weapons_violation_tr, select=-c(case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- crimen$district
crimen <- subset(crimen, select=-c(district))

crimen$arrest <- as.factor(crimen$arrest)
crimen$arrest <- as.numeric(crimen$arrest)
crimen$primary_type <- as.factor(crimen$primary_type)
crimen$primary_type <- as.numeric(crimen$primary_type)
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
crimen$description <- as.factor(crimen$description)
crimen$description <- as.numeric(crimen$description)
crimen$location_description <- as.factor(crimen$location_description)
crimen$location_description <- as.numeric(crimen$location_description)
crimen$month <- as.numeric(crimen$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(crimen), round(0.75*nrow(crimen)))
train <- crimen[index,]
test <- crimen[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")

#training the map
crime.som <- som(train, grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)
[1] "primary_type"         "description"          "location_description" "arrest"               "ward"                
[6] "month"               
# main characteristics of the map
summary(crime.som)
SOM of size 12x12 with a hexagonal topology and a bubble neighbourhood function.
The number of data layers is 1.
Distance measure(s) used: sumofsquares.
Training data included: 10154 objects.
Mean distance to the closest unit in the map: 7.334.
#Showing the training process
plot(crime.som, type="changes")


#node counts
plot(crime.som, type="counts", main="Examples per Neuron")


#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")


#Scaling original data
set.seed(7)

#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")


#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))


plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))


# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
[1] "TRAINING PREDICTION"
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)
Confusion Matrix and Statistics

           
train_label   1   2   3   4   5   6   7   8   9  10  11  12  14  15  16  17  18  19  20  22  24  25  31
         1    7  28   8   7   3   4   0   0   0   1   1   4   1   0   2   1   1   5   0   0   0   0   0
         2    4 141  66  26   1  38  10   0   0   3   0   0   1   0   0   0   0   0   0   0   0   0   0
         3    1  48 178 103   7 152  27   0   0   7   0   0   0   0   0   0   0   0   0   0   0   0   0
         4    0  29  34 496 104  55   9   0  22   0   0   0   0   0   0   0   0   0   0   0   0   0   0
         5    0   3   2 262 462  15  12   0  11   0   0   0   0   7   0   3   1   0   0   0   0  43   0
         6    0  23  81 166  30 285 303   0   2  44   2   0   0   0   0   0   0   0   0   0   0   0   0
         7    0  28  51  55   1 100 883  25   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0
         8    0   0  24   2   1  13 302  43  42  52   1   0   0   0   0   0   0   0   0   0   0   0   0
         9    7  38  39  51  18  42 233  15 118   8   3   0   0   0   0   0   0   0   0   0   0   0   0
         10   0   0   0   9   5  41  15   0  58 675  69   0   0   9   0   0   0   0   0   0   0   3   0
         11   0  38   5   4  13   5   2   0   0 336 560   2   4 134   0   0   5   0   0   0   0  53   0
         12   7  37  10   6   0   1   0   0   0  74  68   5   3   3   0   1   2   0   0   0   0   3   0
         14   2  14   3   1  28   2   0   0   0  26  43   2   9  10   0   1   1   0   0   0   0  20   0
         15   0   0   0   0  66   1   0   0   0  39 257   0   0 183   0   2   2   1   0   0   0  88   0
         16   0   0   0   0  24   0   0   0   0   0   3   0   0   4  13   1   5  22   0   0   7  21   0
         17   0   0   0   0  49   0   0   0   0   0  13   0   0   9   2  15   1   3   0   0   4  29   0
         18   0   4   0   0   2   0   0   0   0   4  29   0   0   3   4   8  10  19   0   0   0   9   0
         19   1   1   0   0   3   0   0   0   0   0   5   0   0   0   1   0   3  45   0   0  32   2   0
         20   0   0   0   0   8   0   0   0   0   0   0   0   0   2   5   3   2  14   0   0  32   9   0
         22   0   0  26  10 104  90  20   0   1  30   4   0   0   7   0   2   0   0   0   0   0  14   0
         24   0   0   0   0   3   0   0   0   0   0   1   0   0   2   3   2   1  11   0   0 106  10   0
         25   0   2   0   0 120   1   0   0   0  10 156   0   0  83   0   4   0   1   0   0   0 190   0
         31   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

Overall Statistics
                                         
               Accuracy : 0.4357         
                 95% CI : (0.426, 0.4454)
    No Information Rate : 0.1788         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.3855         
                                         
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity          0.2413793  0.32488  0.33776  0.41402  0.43916  0.33728  0.48623 0.518072  0.46457   0.51487   0.46091 0.3846154
Specificity          0.9934815  0.98467  0.96416  0.97175  0.96056  0.93007  0.96858 0.956608  0.95414   0.97637   0.93277 0.9787989
Pos Pred Value       0.0958904  0.48621  0.34034  0.66222  0.56273  0.30449  0.77118 0.089583  0.20629   0.76357   0.48234 0.0227273
Neg Pred Value       0.9978177  0.97030  0.96376  0.92536  0.93678  0.93925  0.89644 0.995865  0.98581   0.93139   0.92717 0.9991947
Prevalence           0.0028560  0.04274  0.05190  0.11798  0.10360  0.08322  0.17885 0.008174  0.02501   0.12911   0.11966 0.0012803
Detection Rate       0.0006894  0.01389  0.01753  0.04885  0.04550  0.02807  0.08696 0.004235  0.01162   0.06648   0.05515 0.0004924
Detection Prevalence 0.0071893  0.02856  0.05151  0.07376  0.08085  0.09218  0.11276 0.047272  0.05633   0.08706   0.11434 0.0216663
Balanced Accuracy    0.6174304  0.65478  0.65096  0.69289  0.69986  0.63367  0.72741 0.737340  0.70935   0.74562   0.69684 0.6817072
                     Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity          0.5000000   0.40132  0.433333  0.348837 0.2941176  0.371901        NA        NA   0.58564   0.38462        NA
Specificity          0.9849053   0.95298  0.991407  0.989121 0.9918972  0.995216  0.992614   0.96967   0.99669   0.96097         1
Pos Pred Value       0.0555556   0.28638  0.130000  0.120000 0.1086957  0.483871        NA        NA   0.76259   0.33510        NA
Neg Pred Value       0.9990993   0.97131  0.998309  0.997208 0.9976148  0.992446        NA        NA   0.99251   0.96829        NA
Prevalence           0.0017727   0.04491  0.002955  0.004235 0.0033484  0.011916  0.000000   0.00000   0.01783   0.04865         0
Detection Rate       0.0008864   0.01802  0.001280  0.001477 0.0009848  0.004432  0.000000   0.00000   0.01044   0.01871         0
Detection Prevalence 0.0159543   0.06293  0.009848  0.012310 0.0090605  0.009159  0.007386   0.03033   0.01369   0.05584         0
Balanced Accuracy    0.7424526   0.67715  0.712370  0.668979 0.6430074  0.683558        NA        NA   0.79116   0.67279        NA
#Prediction using the test data
print("TEST PREDICTION")
[1] "TEST PREDICTION"
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
Confusion Matrix and Statistics

          
test_label   1   2   3   4   5   6   7   8   9  10  11  12  14  15  16  17  18  19  20  22  24  25  31
        1    2  14   1   4   0   0   0   0   0   0   1   0   1   0   0   3   0   5   0   0   0   2   0
        2    1  38  19   9   1  14   4   0   0   1   0   1   3   0   0   0   0   0   0   0   0   0   0
        3    0  19  46  25   4  47  13   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
        4    0  12   7 174  35  23   8   0   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0
        5    0   1   2  89 131  11   3   0   6   0   2   0   0   2   0   1   0   0   0   0   0  16   0
        6    0   8  31  61   9 103 102   0   1  18   0   0   0   0   0   0   0   0   0   0   0   0   0
        7    0  13  20  27   1  43 324  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
        8    0   0  12   0   2   3  98  20  18  14   0   0   0   0   0   0   0   0   0   0   0   0   0
        9    4  18  12  14  12  14  65   3  26   1   2   0   0   0   0   0   0   0   0   0   0   0   0
        10   0   0   0   2   4  20   7   0  17 188  32   0   0   5   0   0   0   0   0   0   0   0   0
        11   1   8   3   0   5   7   1   0   0 106 179   0   1  48   0   1   1   0   0   0   0  13   0
        12   5  14   3   3   0   1   0   0   0  22  21   1   2   1   0   0   1   0   0   0   0   3   0
        14   0   9   3   0   6   1   0   0   0   8  19   2   1   0   0   1   0   0   0   0   0   4   0
        15   0   0   0   0  16   1   0   0   0  15  84   0   0  55   0   1   0   1   0   0   0  32   0
        16   0   0   0   0   6   0   0   0   0   0   2   0   0   3   4   1   1   7   0   0   2   6   0
        17   0   0   0   0  22   0   0   0   0   0   3   0   0   4   1   3   0   1   0   0   0  16   0
        18   0   1   0   0   0   0   0   0   0   0  10   0   0   0   1   4   5   6   0   0   0   3   0
        19   0   1   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0  14   0   0  12   1   0
        20   0   0   0   0   3   0   0   0   0   0   0   0   0   1   3   3   0   3   0   0   7   1   0
        22   0   0   7   2  30  28   4   0   1  17   0   0   0   1   0   0   0   0   0   0   0   3   0
        24   0   0   0   0   2   0   0   0   0   0   0   0   0   1   2   0   1   7   0   0  21   4   0
        25   0   0   0   0  33   0   0   0   0   4  39   0   0  38   0   0   0   0   0   0   0  73   0
        31   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

Overall Statistics
                                          
               Accuracy : 0.4161          
                 95% CI : (0.3994, 0.4329)
    No Information Rate : 0.1859          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3626          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
Sensitivity          0.153846  0.24359  0.27711  0.42439  0.40432  0.32595  0.51510  0.58824 0.342105   0.47716    0.4543 0.2500000
Specificity          0.990804  0.98358  0.96644  0.96907  0.95654  0.92503  0.95826  0.95612 0.956167   0.97090    0.9348 0.9775148
Pos Pred Value       0.060606  0.41758  0.29870  0.65414  0.49621  0.30931  0.73804  0.11976 0.152047   0.68364    0.4786 0.0129870
Neg Pred Value       0.996717  0.96417  0.96285  0.92431  0.93814  0.93019  0.89643  0.99565 0.984438   0.93374    0.9286 0.9990928
Prevalence           0.003842  0.04610  0.04905  0.12116  0.09574  0.09338  0.18587  0.01005 0.022459   0.11643    0.1164 0.0011820
Detection Rate       0.000591  0.01123  0.01359  0.05142  0.03871  0.03044  0.09574  0.00591 0.007683   0.05556    0.0529 0.0002955
Detection Prevalence 0.009752  0.02689  0.04551  0.07861  0.07801  0.09840  0.12973  0.04935 0.050532   0.08126    0.1105 0.0227541
Balanced Accuracy    0.572325  0.61359  0.62177  0.69673  0.68043  0.62549  0.73668  0.77218 0.649136   0.72403    0.6945 0.6137574
                     Class: 14 Class: 15 Class: 16 Class: 17 Class: 18 Class: 19 Class: 20 Class: 22 Class: 24 Class: 25 Class: 31
Sensitivity          0.1250000   0.34591  0.363636 0.1666667  0.555556  0.318182        NA        NA  0.500000   0.41243        NA
Specificity          0.9843009   0.95349  0.991699 0.9860368  0.992593  0.995210  0.993794   0.97252  0.994913   0.96445         1
Pos Pred Value       0.0185185   0.26829  0.125000 0.0600000  0.166667  0.466667        NA        NA  0.552632   0.39037        NA
Neg Pred Value       0.9978979   0.96729  0.997912 0.9955009  0.998807  0.991055        NA        NA  0.993724   0.96747        NA
Prevalence           0.0023641   0.04699  0.003251 0.0053191  0.002660  0.013002  0.000000   0.00000  0.012411   0.05230         0
Detection Rate       0.0002955   0.01625  0.001182 0.0008865  0.001478  0.004137  0.000000   0.00000  0.006206   0.02157         0
Detection Prevalence 0.0159574   0.06058  0.009456 0.0147754  0.008865  0.008865  0.006206   0.02748  0.011229   0.05526         0
Balanced Accuracy    0.5546505   0.64970  0.677668 0.5763518  0.774074  0.656696        NA        NA  0.747457   0.68844        NA
library(RColorBrewer)
groups<-23
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = heat.colors(groups)[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)

---
title: "R Notebook"
output: html_notebook
---

## READ THE DATA

```{r}
library(dplyr)
library(data.table)
library(mltools)
chicago_crime <- read.table(file = "chicago_crime_clean.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

chicago_crime$location_description <- (gsub(","," ",chicago_crime$location_description))
chicago_crime$description <- gsub(":=","",chicago_crime$description)
chicago_crime$description <- gsub(":","",chicago_crime$description)
chicago_crime$description <- gsub("MANU/POSS. W/","",chicago_crime$description)
chicago_crime$description <- gsub(",","",chicago_crime$description)
#chicago_crime$description <- gsub(".","",chicago_crime$description)
chicago_crime$location_description <- gsub("(E.G.  UBER  LYFT)","",chicago_crime$location_description)
chicago_crime$location_description <- gsub(",","",chicago_crime$location_description)
#chicago_crime$location_description <- gsub(".","",chicago_crime$location_description)

chicago_crime <- chicago_crime %>%
    dplyr::mutate(year = lubridate::year(date), 
                month = lubridate::month(date), 
                day = lubridate::day(date))
chicago_crime <- na.omit(chicago_crime)
chicago_crime <- select(chicago_crime,-c(X,unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date,year))
chicago_crime$district <- factor(chicago_crime$district)

unique(chicago_crime$primary_type)

head(chicago_crime)
summary(chicago_crime)
```

## Read the training set

```{r}
library(dplyr)
library(data.table)
library(mltools)
chicago_crime_tr <- read.table(file = "chicago_crime_tr.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)

chicago_crime_tr$location_description <- (gsub(","," ",chicago_crime_tr$location_description))
chicago_crime_tr$description <- gsub(":=","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub(":","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub("MANU/POSS. W/","",chicago_crime_tr$description)
chicago_crime_tr$description <- gsub(",","",chicago_crime_tr$description)
#chicago_crime$description <- gsub(".","",chicago_crime$description)
chicago_crime_tr$location_description <- gsub("(E.G.  UBER  LYFT)","",chicago_crime_tr$location_description)
chicago_crime_tr$location_description <- gsub(",","",chicago_crime_tr$location_description)
#chicago_crime$location_description <- gsub(".","",chicago_crime$location_description)

chicago_crime_tr <- chicago_crime_tr %>%
    dplyr::mutate(year = lubridate::year(date), 
                month = lubridate::month(date), 
                day = lubridate::day(date))

chicago_crime_tr <- select(chicago_crime_tr,-c(X,unique_key, x_coordinate, y_coordinate, location, domestic, fbi_code, date,year))

chicago_crime_tr <- na.omit(chicago_crime_tr)
chicago_crime_tr$district <- factor(chicago_crime_tr$district)

unique(chicago_crime_tr$primary_type)

head(chicago_crime_tr)
summary(chicago_crime_tr)

chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "CRIMINAL TRESPASS"] <- "ROBBERY"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "BURGLARY"] <- "ROBBERY"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "MOTOR VEHICLE THEFT"] <- "THEFT"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "HOMICIDE"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "KIDNAPPING"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "BATTERY"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "INTIMIDATION"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "ARSON"] <- "VIOLENT CRIME"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "PROSTITUTION"] <- "SEX OFFENSE"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "CRIM SEXUAL ASSAULT"] <- "SEX OFFENSE"
chicago_crime_tr$primary_type[chicago_crime_tr$primary_type == "OTHER NARCOTIC VIOLATION"] <- "NARCOTICS"
chicago_crime_tr$primary_type <- factor(chicago_crime_tr$primary_type)

chicago_crime_subset_tr <- subset(chicago_crime_tr, primary_type=="ASSAULT" | primary_type == "VIOLENT CRIME" | primary_type == "THEFT" | primary_type=="NARCOTICS" | primary_type == "WEAPONS VIOLATION" | primary_type=="ROBBERY" | primary_type == "CRIMINAL DAMAGE" | primary_type == "DECEPTIVE PRACTICE" )

chicago_crime_subset_tr$primary_type <- factor(chicago_crime_subset_tr$primary_type)
chicago_crime_subset_tr <- na.omit(chicago_crime_subset_tr)
library(DataExplorer)
plot_str(chicago_crime_subset_tr)
plot_missing(chicago_crime_subset_tr)
#plot_histogram(chicago_crime_subset)
#plot_density(chicago_crime_subset)
#plot_correlation(chicago_numeric, type = 'continuous')
chicago_crime_subset_tr$month <- as.factor(chicago_crime_subset_tr$month)

plot_bar(chicago_crime_subset_tr)
```
## EXPLORATORY ANALYSIS

```{r}
library(tidyverse)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

chicago_crime %>% 
  count(primary_type)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1))

chicago_crime %>% 
  count(district)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = arrest)) +
  theme(axis.text.x = element_text(hjust = 1))

chicago_crime %>% 
  count(arrest)

#chicago_crime$primary_type <- as.character(junk$nm)
chicago_crime$primary_type[chicago_crime$primary_type == "CRIMINAL TRESPASS"] <- "ROBBERY"
chicago_crime$primary_type[chicago_crime$primary_type == "BURGLARY"] <- "ROBBERY"
chicago_crime$primary_type[chicago_crime$primary_type == "MOTOR VEHICLE THEFT"] <- "THEFT"
chicago_crime$primary_type[chicago_crime$primary_type == "HOMICIDE"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "KIDNAPPING"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "BATTERY"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "INTIMIDATION"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "ARSON"] <- "VIOLENT CRIME"
chicago_crime$primary_type[chicago_crime$primary_type == "PROSTITUTION"] <- "SEX OFFENSE"
chicago_crime$primary_type[chicago_crime$primary_type == "CRIM SEXUAL ASSAULT"] <- "SEX OFFENSE"
chicago_crime$primary_type[chicago_crime$primary_type == "OTHER NARCOTIC VIOLATION"] <- "NARCOTICS"
chicago_crime$primary_type <- factor(chicago_crime$primary_type)

ggplot(data = chicago_crime) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

chicago_crime %>% 
  count(primary_type)

chicago_crime_subset <- subset(chicago_crime, primary_type=="ASSAULT" | primary_type == "VIOLENT CRIME" | primary_type == "THEFT" | primary_type=="NARCOTICS" | primary_type == "WEAPONS VIOLATION" | primary_type=="ROBBERY" | primary_type == "CRIMINAL DAMAGE" | primary_type == "DECEPTIVE PRACTICE" )
chicago_crime_subset$primary_type <- factor(chicago_crime_subset$primary_type)
ggplot(data = chicago_crime_subset) +
  geom_bar(mapping = aes(x = primary_type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = primary_type, y = district)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = arrest, y = primary_type)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(data = chicago_crime_subset) +
  geom_count(mapping = aes(x = arrest, y = district)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

```

## EXPLORATORY ANALYSIS BY CRIME

```{r}
assault <- subset(chicago_crime_subset, primary_type=="ASSAULT")
violent_crime <- subset(chicago_crime_subset, primary_type=="VIOLENT CRIME")
theft <- subset(chicago_crime_subset, primary_type=="THEFT")
narcotics <- subset(chicago_crime_subset, primary_type=="NARCOTICS")
weapons_violation <- subset(chicago_crime_subset, primary_type=="WEAPONS VIOLATION")
robbery <- subset(chicago_crime_subset, primary_type=="ROBBERY")
criminal_damage <- subset(chicago_crime_subset, primary_type=="CRIMINAL DAMAGE")
deceptive_practice <- subset(chicago_crime_subset, primary_type=="DECEPTIVE PRACTICE")

assault_tr <- subset(chicago_crime_subset_tr, primary_type=="ASSAULT")
violent_tr_crime <- subset(chicago_crime_subset_tr, primary_type=="VIOLENT CRIME")
theft_tr <- subset(chicago_crime_subset_tr, primary_type=="THEFT")
narcotics_tr <- subset(chicago_crime_subset_tr, primary_type=="NARCOTICS")
weapons_violation_tr <- subset(chicago_crime_subset_tr, primary_type=="WEAPONS VIOLATION")
robbery_tr <- subset(chicago_crime_subset_tr, primary_type=="ROBBERY")
criminal_damage_tr <- subset(chicago_crime_subset_tr, primary_type=="CRIMINAL DAMAGE")
deceptive_practice_tr <- subset(chicago_crime_subset_tr, primary_type=="DECEPTIVE PRACTICE")
```

## DISTRICTS

```{r}
library(sqldf)

districts_true <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as arrest FROM chicago_crime_subset WHERE arrest LIKE "True" GROUP BY district ORDER BY district')
districts_false <- sqldf('SELECT district, AVG(latitude) as avg_latitude,AVG(longitude) as avg_longitude, count(*) as no_arrest FROM chicago_crime_subset WHERE arrest LIKE "False" GROUP BY district ORDER BY district')
districts_true$arrest <- as.numeric(districts_true$arrest)
districts_false$no_arrest <- as.numeric(districts_false$no_arrest)
districts_true
districts_false

police_districts <- read.table(file = "Police_Stations.csv", #Name of text file.
                      sep = ",",                       #Separation character.
                      header = TRUE,                   #If column names are in the first row.
                      na.strings = "NA",               #Character to be marked as missing value.
                      stringsAsFactors = FALSE)
police_districts

police_districts$DISTRICT[police_districts$DISTRICT == "Headquarters"] <- "0"
police_districts$DISTRICT <- as.factor(police_districts$DISTRICT)

districts <- sqldf('SELECT DISTRICT as district, LATITUDE as latitude,LONGITUDE as longitude FROM police_districts')

arrest_percentage <- data.frame('District' = districts_false$district, 'PctArrest' = districts_true$arrest/(districts_true$arrest + districts_false$no_arrest), 'Crimes' = (districts_true$arrest + districts_false$no_arrest))
arrest_percentage

ggplot(data = arrest_percentage) +
  geom_col(mapping = aes(x = District, y = Crimes)) +
  geom_line(aes(x = District, y = PctArrest*10000, group = 1), color = "yellow") +
  scale_y_continuous(sec.axis = sec_axis(~./10000, name = "PctArrest")) +
  theme(axis.text.x = element_text(hjust = 1))

## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")

setDT(districts_false)

#devtools::install_github("dkahle/ggmap", ref = "tidyup", force = TRUE)
library(ggmap)
chicago <- get_stamenmap(bbox = c(left = -88.0225, bottom = 41.5949, 
                                  right = -87.2713, top = 42.0677), 
                         zoom = 11)
ggmap(chicago) +
geom_text(aes(x = longitude, y = latitude, label = district), data = districts)
```


```{r}

library(ggmap)
chicago <- get_stamenmap(bbox = c(left = -88.0225, bottom = 41.5949, 
                                  right = -87.2713, top = 42.0677), 
                         zoom = 11)
ggmap(chicago) +
geom_text(aes(x = LONGITUDE, y = LATITUDE, label = DISTRICT), data = police_districts)
```

```{r}
ggplot(data = assault) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("ASSAULT BY DISTRICT")

ggplot(data = theft) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("THEFTS BY DISTRICT")

ggplot(data = violent_crime) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("VIOLENT CRIMES BY DISTRICT")

ggplot(data = narcotics) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("NARCOTIC CRIMES BY DISTRICT")

ggplot(data = weapons_violation) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("WEAPON-RELATED CRIMES BY DISTRICT")

ggplot(data = robbery) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("ROBBERIES BY DISTRICT")

ggplot(data = criminal_damage) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("CRIMINAL DAMAGE CRIMES BY DISTRICT")

ggplot(data = deceptive_practice) +
  geom_bar(mapping = aes(x = district)) +
  theme(axis.text.x = element_text(hjust = 1)) +
  ggtitle("DECEPTIVE PRACTICE CRIMES BY DISTRICT")
```
```{r}
library(ggplot2)

ggplot(data = chicago_crime_subset, aes(x=primary_type, y=district, fill=arrest)) + 
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r}
# Correlation
library(ggplot2)
ggplot(chicago_crime_subset,aes(x=district,y=primary_type,color=arrest))+geom_point(alpha=0.5)
```

## Association Rules

```{r}
chicago_crime_subset_2 <- subset(chicago_crime_subset, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
chicago_crime_subset_2 <- subset(chicago_crime_subset_2, select=-c(location_description))
write.csv(chicago_crime_subset_2,"chicago_crime_AR.csv", quote = FALSE, row.names = FALSE)
library(arules)
crime_transactions <- read.transactions("chicago_crime_AR.csv", sep=",")

#deceptive_practice_2 <- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#write.csv(deceptive_practice_2,"deceptive_practice.csv", quote = FALSE, row.names = FALSE)
#dp_transactions <- read.transactions("deceptive_practice.csv", sep=",")
```
```{r}
if (!require("RColorBrewer")) {
  # install color package of R
  install.packages("RColorBrewer")
  #include library RColorBrewer
  library(RColorBrewer)
}

itemFrequencyPlot(crime_transactions,topN=20,type="absolute",
                  col=brewer.pal(8,'Pastel2'), 
                  main="Absolute Item Frequency Plot")
```
## Reglas de Asociacion General
```{r}
# Rule GENERATION
association.rules.clean <- apriori(crime_transactions, parameter = list(supp=0.001, conf=0.7))
subset.rules.clean <- which(colSums(is.subset(association.rules.clean, association.rules.clean)) > 1)
subset.association.rules.clean. <- association.rules.clean[-subset.rules.clean]
inspect(subset.association.rules.clean.)

rules_by_count <- sort(association.rules.clean, by = "count")
rules_by_conf <- sort(association.rules.clean, by = "confidence")
rules_by_supp <- sort(association.rules.clean, by = "lift")
inspect(rules_by_count)
inspect(rules_by_conf)
inspect(rules_by_supp)
```
```{r}
# Rule GENERATION
assault.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                   appearance = list(default="lhs",rhs="ASSAULT"))
# Borrar reglas redundantes
assault.subset.rules <- which(colSums(is.subset(assault.association.rules, assault.association.rules)) > 1) # get subset rules in vector
assault.subset.association.rules. <- assault.association.rules[-assault.subset.rules] # remove subset rules.
inspect(assault.subset.association.rules.)

as_by_count <- sort(assault.association.rules, by = "count")
as_by_conf <- sort(assault.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(as_by_count)
inspect(as_by_conf)
#inspect(dp_by_supp)
```
```{r}
# Rule GENERATION
cd.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                     appearance = list(default="lhs",rhs="CRIMINAL DAMAGE"))
# Borrar reglas redundantes
cd.subset.rules <- which(colSums(is.subset(cd.association.rules, cd.association.rules)) > 1) # get subset rules in vector
cd.subset.association.rules. <- cd.association.rules[-cd.subset.rules] # remove subset rules.
inspect(cd.association.rules)

cd_by_count <- sort(cd.association.rules, by = "count")
cd_by_conf <- sort(cd.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(cd_by_count)
inspect(cd_by_conf)
#inspect(dp_by_supp)
```
```{r}
# Rule GENERATION
dp.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                     appearance = list(default="lhs",rhs="DECEPTIVE PRACTICE"))
# Borrar reglas redundantes
dp.subset.rules <- which(colSums(is.subset(dp.association.rules, dp.association.rules)) > 1) # get subset rules in vector
dp.subset.association.rules. <- dp.association.rules[-dp.subset.rules] # remove subset rules.
inspect(dp.subset.association.rules.)

dp_by_count <- sort(dp.subset.association.rules., by = "count")
dp_by_conf <- sort(dp.subset.association.rules., by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(dp_by_count)
inspect(dp_by_conf)
#inspect(dp_by_supp)
```
```{r}
narcotics_clean.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.1),
                                   appearance = list(default="lhs",rhs="NARCOTICS"))
# Borrar reglas redundantes
narcotics_clean.subset.rules <- which(colSums(is.subset(narcotics_clean.association.rules, narcotics_clean.association.rules)) > 1) # get subset rules in vector
narcotics_clean.subset.association.rules. <- narcotics_clean.association.rules[-narcotics_clean.subset.rules] # remove subset rules.
inspect(narcotics_clean.subset.association.rules.)

narc_by_count <- sort(narcotics_clean.association.rules, by = "count")
narc_by_conf <- sort(narcotics_clean.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(narc_by_count)
inspect(narc_by_conf)
#inspect(dp_by_supp)
```
```{r}
robbery.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.15),
                                   appearance = list(default="lhs",rhs="ROBBERY"))
# Borrar reglas redundantes
robbery.subset.rules <- which(colSums(is.subset(robbery.association.rules, robbery.association.rules)) > 1) 
robbery.subset.association.rules. <- robbery.association.rules[-robbery.subset.rules] # remove subset rules.
inspect(robbery.association.rules)

rob_by_count <- sort(robbery.association.rules, by = "count")
rob_by_conf <- sort(robbery.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(rob_by_count)
inspect(rob_by_conf)
#inspect(dp_by_supp)
```
```{r}
theft.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.005, conf=0.5),
                                   appearance = list(default="lhs",rhs="THEFT"))
# Borrar reglas redundantes
theft.subset.rules <- which(colSums(is.subset(theft.association.rules, theft.association.rules)) > 1) 
theft.subset.association.rules. <- theft.association.rules[-theft.subset.rules] # remove subset rules.
inspect(theft.subset.association.rules.)

theft_by_count <- sort(theft.association.rules, by = "count")
theft_by_conf <- sort(theft.association.rules, by = "confidence")
#dp_by_supp <- sort(dp.subset.association.rules., by = "support")
inspect(theft_by_count)
inspect(theft_by_conf)
#inspect(dp_by_supp)
```
```{r}
vc.association.rules <- apriori(crime_transactions, parameter = 
                                     list(supp=0.001, conf=0.15),
                                   appearance = list(default="lhs",rhs="VIOLENT CRIME"))
# Borrar reglas redundantes
vc.subset.rules <- which(colSums(is.subset(vc.association.rules, vc.association.rules)) > 1) # get subset rules in  
vc.subset.association.rules. <- vc.association.rules[-vc.subset.rules] # remove subset rules.
inspect(vc.subset.association.rules.)

vc_by_count <- sort(vc.association.rules, by = "count")
vc_by_conf <- sort(vc.association.rules, by = "confidence")
#vc_by_supp <- sort(vc.subset.association.rules., by = "support")
inspect(vc_by_count)
inspect(vc_by_conf)
#inspect(wv_by_supp)
```
```{r}
wv.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.1),
                                      appearance = list(default="lhs",rhs="WEAPONS VIOLATION"))
# Borrar reglas redundantes
wv.subset.rules <- which(colSums(is.subset(wv.association.rules, wv.association.rules)) > 1) # get subset rules in  
wv.subset.association.rules. <- wv.association.rules[-wv.subset.rules] # remove subset rules.
inspect(wv.subset.association.rules.)

wv_by_count <- sort(wv.association.rules, by = "count")
wv_by_conf <- sort(wv.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(wv_by_count)
inspect(wv_by_conf)
#inspect(wv_by_supp)
```
```{r}
true.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.5),
                                      appearance = list(default="lhs",rhs="True"))
# Borrar reglas redundantes
true.subset.rules <- which(colSums(is.subset(true.association.rules, true.association.rules)) > 1) # get subset rules in  
true.subset.association.rules. <- true.association.rules[-true.subset.rules] # remove subset rules.
inspect(true.subset.association.rules.)

t_by_count <- sort(true.subset.association.rules., by = "count")
t_by_conf <- sort(true.subset.association.rules., by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(t_by_count)
inspect(t_by_conf)
#inspect(wv_by_supp)
```
```{r}
false.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.001, conf=0.8),
                                      appearance = list(default="lhs",rhs="False"))
# Borrar reglas redundantes
false.subset.rules <- which(colSums(is.subset(false.association.rules, false.association.rules)) > 1) # get subset rules in  
false.subset.association.rules. <- false.association.rules[-false.subset.rules] # remove subset rules.
inspect(false.subset.association.rules.)

f_by_count <- sort(false.association.rules, by = "count")
f_by_conf <- sort(false.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(f_by_count)
inspect(f_by_conf)
#inspect(wv_by_supp)
```
```{r}
ocho.association.rules <- apriori(crime_transactions,parameter = 
                                      list(supp=0.0001, conf=0.01),
                                      appearance = list(default="lhs",rhs="8"))
# Borrar reglas redundantes
ocho.subset.rules <- which(colSums(is.subset(ocho.association.rules, ocho.association.rules)) > 1) # get subset rules in  
ocho.subset.association.rules. <- ocho.association.rules[-ocho.subset.rules] # remove subset rules.
inspect(ocho.subset.association.rules.)

ocho_by_count <- sort(ocho.association.rules, by = "count")
ocho_by_conf <- sort(ocho.association.rules, by = "confidence")
#wv_by_supp <- sort(wv.subset.association.rules., by = "support")
inspect(ocho_by_count)
inspect(ocho_by_conf)
#inspect(wv_by_supp)
```

```{r}
## GRAFICOS 
## Dataset Entero
library(arulesViz)
# Filter rules with confidence greater than 0.4 or 40%
subRules<-association.rules.clean[quality(association.rules.clean)$confidence>0.7]
#Plot SubRules
plot(subRules,method="two-key plot")

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(subRules, n = 25, by = "confidence")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(top10subRules, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules, n=25, by="confidence")
plot(subRules2, method="paracoord")

```
```{r}
#Plot SubRules
plot(assault.subset.association.rules.,method="two-key plot")

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(assault.subset.association.rules., n = 20, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(assault.subset.association.rules., method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(assault.subset.association.rules., n=20, by="confidence")
plot(top10subRules, method="paracoord")
```
```{r}
#Plot SubRules
plot(cd.association.rules,method="two-key plot")

subRules_cd<-cd.association.rules[quality(cd.association.rules)$confidence>0.2]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(cd.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_cd, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_cd, n=25, by="confidence")
plot(top10subRules, method="paracoord")
```
```{r}
#Plot SubRules
plot(dp.association.rules,method="two-key plot")

subRules_dp<-dp.association.rules[quality(dp.association.rules)$confidence>0.1]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(dp.association.rules, n = 10, by = "count")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_dp, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_dp, n=25, by="count")
plot(subRules_dp, method="paracoord")
```
```{r}
#Plot SubRules
plot(narcotics_clean.association.rules,method="two-key plot")

subRules_narcotics<-narcotics_clean.association.rules[quality(narcotics_clean.association.rules)$confidence>0.6]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(narcotics_clean.association.rules, n = 10, by = "confidence")

# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_narcotics, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_narcotics, n=25, by="confidence")
plot(subRules_narcotics, method="paracoord")
```
```{r}
#Plot SubRules
plot(robbery.association.rules,method="two-key plot")

subRules_robbery<-robbery.association.rules[quality(robbery.association.rules)$confidence>0.15]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(robbery.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_robbery, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation Dataset Limpio
# Filter top 20 rules with highest lift
subRules2<-head(subRules_robbery, n=25, by="confidence")
plot(top10subRules, method="paracoord")
```
```{r}
#Plot SubRules
plot(theft.association.rules,method="two-key plot")

subRules_theft<-theft.association.rules[quality(theft.association.rules)$confidence>0.45]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(theft.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_theft, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation
plot(top10subRules, method="paracoord")
```
```{r}
#Plot SubRules
plot(vc.association.rules,method="two-key plot")

subRules_vc<-vc.association.rules[quality(vc.association.rules)$confidence>0.15]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(vc.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_vc, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation
plot(top10subRules, method="paracoord")
```
```{r}
#Plot SubRules
plot(wv.association.rules,method="two-key plot")

subRules_wv<-wv.association.rules[quality(wv.association.rules)$confidence>0.1]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(wv.association.rules, n = 10, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_wv, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation
plot(top10subRules, method="paracoord")
```

```{r}
#Plot SubRules
plot(ocho.association.rules,method="two-key plot")

subRules_8<-ocho.association.rules[quality(ocho.association.rules)$confidence>0.01]

## Seleccionamos un numero limitado de reglas en el dataset limpio
top10subRules <- head(ocho.association.rules, n = 20, by = "confidence")
inspect(top10subRules)
# Now, plot an interactive graph:
#Note: You can make all your plots interactive using engine=htmlwidget parameter in plot
plot(subRules_8, method = "graph",  engine = "htmlwidget")

## Individual Rule Representation
plot(top10subRules, method="paracoord")
```

## Mapas de Densidad

```{r}
## INITIALIZE
library("leaflet")
library("data.table")
library("sp")
library("rgdal")
# library("maptools")
library("KernSmooth")
library(viridis)
library(RColorBrewer)

assault <- na.omit(assault)
setDT(assault)
criminal_damage <- na.omit(criminal_damage)
setDT(criminal_damage)
deceptive_practice <- na.omit(deceptive_practice)
setDT(deceptive_practice)
narcotics <- na.omit(narcotics)
setDT(narcotics)
robbery <- na.omit(robbery)
setDT(robbery)
theft <- na.omit(theft)
setDT(theft)
violent_crime <- na.omit(violent_crime)
setDT(violent_crime)
weapons_violation <- na.omit(weapons_violation)
setDT(weapons_violation)

## MAKE CONTOUR LINES
## Assault
kde_assault <- bkde2D(assault[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_assault <- contourLines(kde_assault$x1 , kde_assault$x2 , kde_assault$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_assault<- as.factor(sapply(CL_assault, `[[`, "level"))
NLEV_assault <- length(levels(LEVS_assault))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_assault <- lapply(1:length(CL_assault), function(i)
    Polygons(list(Polygon(cbind(CL_assault[[i]]$x, CL_assault[[i]]$y))), ID=i))
spgons_assault = SpatialPolygons(pgons_assault)

## Criminal Damage
kde_cd <- bkde2D(criminal_damage[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_cd <- contourLines(kde_cd$x1 , kde_cd$x2 , kde_cd$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_cd<- as.factor(sapply(CL_cd, `[[`, "level"))
NLEV_cd <- length(levels(LEVS_cd))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_cd <- lapply(1:length(CL_cd), function(i)
    Polygons(list(Polygon(cbind(CL_cd[[i]]$x, CL_cd[[i]]$y))), ID=i))
spgons_cd = SpatialPolygons(pgons_cd)

## Deceptive Practice
kde_dp <- bkde2D(deceptive_practice[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_dp <- contourLines(kde_dp$x1 , kde_dp$x2 , kde_dp$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_dp<- as.factor(sapply(CL_dp, `[[`, "level"))
NLEV_dp <- length(levels(LEVS_dp))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_dp <- lapply(1:length(CL_dp), function(i)
    Polygons(list(Polygon(cbind(CL_dp[[i]]$x, CL_dp[[i]]$y))), ID=i))
spgons_dp = SpatialPolygons(pgons_dp)

## Narcotics
kde_narcotics <- bkde2D(narcotics[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_narcotics <- contourLines(kde_narcotics$x1 , kde_narcotics$x2 , kde_narcotics$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_narcotics <- as.factor(sapply(CL_narcotics, `[[`, "level"))
NLEV_narcotics <- length(levels(LEVS_narcotics))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_narcotics <- lapply(1:length(CL_narcotics), function(i)
    Polygons(list(Polygon(cbind(CL_narcotics[[i]]$x, CL_narcotics[[i]]$y))), ID=i))
spgons_narcotics = SpatialPolygons(pgons_narcotics)

## Robbery
kde_robbery <- bkde2D(robbery[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_robbery <- contourLines(kde_robbery$x1 , kde_robbery$x2 , kde_robbery$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_robbery <- as.factor(sapply(CL_robbery, `[[`, "level"))
NLEV_robbery <- length(levels(LEVS_robbery))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_robbery <- lapply(1:length(CL_robbery), function(i)
    Polygons(list(Polygon(cbind(CL_robbery[[i]]$x, CL_robbery[[i]]$y))), ID=i))
spgons_robbery = SpatialPolygons(pgons_robbery)

## Thefts
kde_theft <- bkde2D(theft[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_theft <- contourLines(kde_theft$x1 , kde_theft$x2 , kde_theft$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_theft <- as.factor(sapply(CL_theft, `[[`, "level"))
NLEV_theft <- length(levels(LEVS_theft))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_theft <- lapply(1:length(CL_theft), function(i)
    Polygons(list(Polygon(cbind(CL_theft[[i]]$x, CL_theft[[i]]$y))), ID=i))
spgons_theft = SpatialPolygons(pgons_theft)

## Violent Crimws
kde_vc <- bkde2D(violent_crime[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_vc <- contourLines(kde_vc$x1 , kde_vc$x2 , kde_vc$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_vc <- as.factor(sapply(CL_vc, `[[`, "level"))
NLEV_vc <- length(levels(LEVS_vc))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_vc <- lapply(1:length(CL_vc), function(i)
    Polygons(list(Polygon(cbind(CL_vc[[i]]$x, CL_vc[[i]]$y))), ID=i))
spgons_vc = SpatialPolygons(pgons_vc)

## Weapons Violation
kde_wv <- bkde2D(weapons_violation[ , list(longitude, latitude)],
              bandwidth=c(.0001, .0001), gridsize = c(75,75))
CL_wv <- contourLines(kde_wv$x1 , kde_wv$x2 , kde_wv$fhat)

## EXTRACT CONTOUR LINE LEVELS
LEVS_wv <- as.factor(sapply(CL_wv, `[[`, "level"))
NLEV_wv <- length(levels(LEVS_wv))

## CONVERT CONTOUR LINES TO POLYGONS
pgons_wv <- lapply(1:length(CL_wv), function(i)
    Polygons(list(Polygon(cbind(CL_wv[[i]]$x, CL_wv[[i]]$y))), ID=i))
spgons_wv = SpatialPolygons(pgons_wv)

leaflet() %>% addTiles() %>%
    addPolygons(data = spgons_narcotics, color = brewer.pal(NLEV_narcotics, name = "YlOrRd")[LEVS_narcotics], group = "Narcotics") %>%
    addPolygons(data = spgons_assault, color = brewer.pal(NLEV_assault, name = "Reds")[LEVS_assault], group = "Assault") %>%
    addPolygons(data = spgons_cd, color = brewer.pal(NLEV_cd, name="YlGnBu")[LEVS_cd], group = "Criminal Damage") %>%
    addPolygons(data = spgons_dp, color = brewer.pal(NLEV_dp, name = "YlGn")[LEVS_dp], group = "Deceptive Practice") %>%
    addPolygons(data = spgons_robbery, color = brewer.pal(NLEV_robbery, name = "Purples")[LEVS_robbery], group = "Robbery") %>%
    addPolygons(data = spgons_theft, color = brewer.pal(NLEV_theft, name = "Oranges")[LEVS_theft], group = "Thefts") %>%
    addPolygons(data = spgons_vc, color = brewer.pal(NLEV_vc, name = "Greys")[LEVS_vc], group = "Violent Crimes") %>%
    addPolygons(data = spgons_wv, color = brewer.pal(NLEV_wv, name = "Blues")[LEVS_wv], group = "Weapons Violation") %>%
    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
                      labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T), group = "Districts") %>%
    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))
    
#addCircles(lng = narcotics$longitude, lat = narcotics$latitude,radius = .1, opacity = .4, col = "blue", group = "Points") %>%


```
```{r}
#leaflet() %>% addTiles() %>%
#    addCircles(lng = weapons_violation$longitude, lat = weapons_violation$latitude,radius = .05, opacity = 0.1, col = brewer.pal(10,name = "Reds"), group = "Narcotics") %>%
#    addLabelOnlyMarkers(districts$longitude, districts$latitude, label =  districts$district, 
#                      labelOptions = labelOptions(noHide = T, direction = 'top', textOnly = T, textsize = "15px"), group = #"Districts") %>%
#    addLayersControl(overlayGroups = c("Assault", "Criminal Damage","Deceptive Practice", "Narcotics","Robbery","Thefts","Violent Crimes","Weapons Violation", "Districts"),options = layersControlOptions(collapsed = FALSE))

```
## Clustering 
```{r}
chicago_crime_clustering <- subset(chicago_crime_subset, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
unique(chicago_crime_clustering$primary_type)

library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))

types <- unique(chicago_crime_clustering$primary_type)
chicago_crime_clustering$primary_type <- match(chicago_crime_clustering$primary_type, unique(chicago_crime_clustering$primary_type))
chicago_crime_clustering$arrest <- match(chicago_crime_clustering$arrest, unique(chicago_crime_clustering$arrest))
#chicago_crime_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(chicago_crime_clustering$location_description))
chicago_crime_clustering$district <- as.numeric(chicago_crime_clustering$district)
test <- chicago_crime_clustering
#Normalization of variables
library(RSNNS)

train_set <- subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
train_set$primary_type <- match(train_set$primary_type, unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
train_label <- train_set[,1]
test_label <- test[,1]

#Optimum number of clusters. Elbow method
# Alternative using fviz function for Elbow method
library(factoextra)
library(NbClust)
set.seed(123)
train_small <- train_set[1:1000,]
test_small <- test[1:100,]

## Dendograms
library(tidyverse)      #data manipulation and visualization
library(class)          # to call class package for kNN
library(caret)
library(cluster)
# Divisive Hierarchical Clustering - diana
# compute divisive hierarchical clustering
div <- diana(train_small)
plot(div, main = "Divisive")

distance <- dist(train_small,method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(agg,
     main = "Agglomerative, complete linkages")

library(fpc)
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
                  "wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
  row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
  stats.names[i] <- paste("Test", i-1)
  
  for(j in seq_along(clust.assess)){
    output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
    
  }
  
  for(d in 1:k) {
    cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
    dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
    cluster.sizes[d, i]
    
  }
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
# I am capping the maximum amout of clusters by 7
# I want to choose a reasonable number, based on which I will be able to see basic differences between customer groups as a result
stats.df.divisive <- cstats.table(distance, div, 7)
stats.df.divisive

stats.df.aggl <-cstats.table(distance, agg, 7) #complete linkages looks like the most balanced approach
stats.df.aggl

#confusionMatrix(train_small, )
```
```{r}
library(data.table)
library(mltools)
#Crime_chicago_dummy <- one_hot(as.data.table(Crime_chicago_def_clust))
#narcotics <- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#narcotics_tr <- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
narcotics_clustering <- subset(narcotics, select=-c(location_description))
narcotics_clustering_tr <- subset(narcotics_tr, select=-c(location_description))

types <- unique(chicago_crime_clustering$primary_type)
narcotics_clustering$primary_type <- match(chicago_crime_clustering$primary_type, unique(chicago_crime_clustering$primary_type))
narcotics_clustering$arrest <- match(chicago_crime_clustering$arrest, unique(chicago_crime_clustering$arrest))
#narcotics_clustering$location_description <- match(chicago_crime_clustering$location_description, unique(narcotics_clustering$location_description))
narcotics_clustering$district <- as.numeric(narcotics_clustering$district)
test <- narcotics_clustering
#Normalization of variables
library(RSNNS)

train_set <- narcotics_clustering_tr
train_set$primary_type <- match(train_set$primary_type, unique(train_set$primary_type))
train_set$arrest <- match(train_set$arrest, unique(train_set$arrest))
#train_set$location_description <- match(train_set$location_description, unique(train_set$location_description))
train_set$district <- as.numeric(train_set$district)

#index <- sample(nrow(chicago_crime_clustering_cut), round(0.75*nrow(chicago_crime_clustering_cut)))
#train <- Crime_chicago_def_clust_cut[index,] 
#test <- Crime_chicago_def_clust_cut[-index,]
train_label <- train_set[,1]
test_label <- test[,1]

#Optimum number of clusters. Elbow method
# Alternative using fviz function for Elbow method
library(factoextra)
library(NbClust)
set.seed(123)
train_small <- train_set[1:1000,]
test_small <- test[1:100,]

## Dendograms
library(tidyverse)      #data manipulation and visualization
library(class)          # to call class package for kNN
library(caret)
library(cluster)
# Divisive Hierarchical Clustering - diana
# compute divisive hierarchical clustering
div <- diana(train_small)
plot(div, main = "Divisive")

distance <- dist(train_small,method = "euclidean")
agg <- hclust(distance, method = "complete")
plot(agg,
     main = "Agglomerative, complete linkages")

library(fpc)
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
                  "wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
  row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
  stats.names[i] <- paste("Test", i-1)
  
  for(j in seq_along(clust.assess)){
    output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
    
  }
  
  for(d in 1:k) {
    cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
    dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
    cluster.sizes[d, i]
    
  }
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
# I am capping the maximum amout of clusters by 7
# I want to choose a reasonable number, based on which I will be able to see basic differences between customer groups as a result
stats.df.divisive <- cstats.table(distance, div, 7)
stats.df.divisive

stats.df.aggl <-cstats.table(distance, agg, 7) #complete linkages looks like the most balanced approach
stats.df.aggl

#confusionMatrix(train_small, )
```

```{r}
library("ggplot2")
library("reshape2")
library("purrr")
library("dplyr")
# let's start with a dendrogram
library("dendextend")
dendro <- as.dendrogram(agg)
dendro.col <- dendro %>%
  set("branches_k_color", k = 8, value =   c("darkslategray", "darkslategray4", "darkslategray3", "gold3", "darkcyan", "cyan3", "gold3")) %>%
  set("branches_lwd", 0.6) %>%
  set("labels_colors", 
      value = c("darkslategray")) %>% 
  set("labels_cex", 0.5)
ggd1 <- as.ggdend(dendro.col)
ggplot(ggd1, theme = theme_minimal()) +
  labs(x = "Num. observations", y = "Height", title = "Dendrogram, k = 8")



```
## Arboles de Decision
```{r}
## c50

library(dplyr)
library(MASS)        # for obtaining data
library(tidyverse)  # for data processing
library(rpart)      # for CART decision tree
library(rpart.plot) # for plotting CART
library(caret)      # for confusion matrix and more
library(rsample)    # for data splitting
library(data.table)
library(C50)


#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)

chicago_crime_trees <- subset(chicago_crime_subset, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))
chicago_crime_trees_tr <- subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude))

library(dplyr)
chicago_crime_trees %>% mutate_if(is.factor, as.character) -> chicago_crime_trees
chicago_crime_trees_tr %>% mutate_if(is.factor, as.character) -> chicago_crime_trees_tr

chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "ROBBERY"] <- "ROB"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "NARCOTICS"] <- "NAR"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "ASSAULT"] <- "ASS"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "WEAPONS VIOLATION"] <- "WV"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "CRIMINAL DAMAGE"] <- "CD"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "VIOLENT CRIME"] <- "VC"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "DECEPTIVE PRACTICE"] <- "DP"
chicago_crime_trees$primary_type[chicago_crime_trees$primary_type == "THEFT"] <- "TH"

chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "ROBBERY"] <- "ROB"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "NARCOTICS"] <- "NAR"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "ASSAULT"] <- "ASS"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "WEAPONS VIOLATION"] <- "WV"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "CRIMINAL DAMAGE"] <- "CD"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "VIOLENT CRIME"] <- "VC"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "DECEPTIVE PRACTICE"] <- "DP"
chicago_crime_trees_tr$primary_type[chicago_crime_trees_tr$primary_type == "THEFT"] <- "TH"

#train_c50<- chicago_crime_trees_tr
#test_c50<- chicago_crime_trees

crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$primary_type <- as.factor(train_c50$primary_type)
test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(primary_type  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.2))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$primary_type)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$primary_type),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$primary_type)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$primary_type),
      "correct classified cases from", length(pred_train))


```
```{r}
#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.1))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,faclen=10, clip.facs=TRUE,subtree=NULL, tweak=1, digits=2)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(arrest  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.9))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,faclen=10, clip.facs=TRUE,subtree=NULL, tweak=1, digits=2)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$arrest)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$arrest),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$arrest)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$arrest),
      "correct classified cases from", length(pred_train))
```


```{r}
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#assault <- subset(assault, select=-c(location_description))
#assault_tr <- subset(assault_tr, select=-c(location_description))

assault_tr <- subset(assault_tr, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18" | district == "9" | district=="10" | district == "22" | district == "15" )
assault <- subset(assault, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18" | district == "9" | district=="10" | district == "22" | district == "15" )

#train_c50<- subset(assault_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(assault, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(assault_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.0001))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#criminal_damage <- subset(criminal_damage, select=-c(location_description))
#criminal_damage_tr <- subset( criminal_damage_tr, select=-c(location_description))
criminal_damage_tr <- subset(criminal_damage_tr, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18"  | district == "9" | district=="20" | district == "25" | district == "11")
criminal_damage <- subset(criminal_damage, district=="3" | district == "4" | district == "5" | district=="6" | district == "7" | district=="8" | district == "1" | district == "18"  | district == "9" | district=="20" | district == "25" | district == "11")

train_c50<- subset(criminal_damage_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(criminal_damage, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)

#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.001))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#deceptive_practice <- subset(deceptive_practice, select=-c(location_description))
#deceptive_practice_tr <- subset(deceptive_practice_tr, select=-c(location_description))

deceptive_practice_tr <- subset(deceptive_practice_tr, district=="19" | district == "4" | district == "25" | district=="2" | district == "24" |district == "1" | district == "18"| district == "20"  | district == "9" | district=="21" | district == "25" | district == "11" )

deceptive_practice <- subset(deceptive_practice, district=="19" | district == "4" | district == "25" | district=="2" | district == "24" |district == "1" | district == "18"| district == "20"  | district == "9" | district=="21" | district == "25" | district == "11" )

#train_c50<- subset(deceptive_practice_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(deceptive_practice, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(deceptive_practice_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.5))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
#levels(chicago_crime$location_description)[1] = "None"
## Creating a training and test datasets
set.seed(1234)
#narcotics <- subset(narcotics, select=-c(location_description))
#narcotics_tr <- subset(narcotics_tr, select=-c(location_description))
narcotics_tr <- subset(narcotics_tr, district=="10" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18")
narcotics <- subset(narcotics, district=="10" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18")

#train_c50<- subset(narcotics_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
#test_c50<- subset(narcotics, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

crime_split_c50<- initial_split(subset(narcotics_tr, select=-c(location_description,case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
train_c50<- training(crime_split_c50)
test_c50<- testing(crime_split_c50)

#train_c50$location_description <- as.factor(train_c50$location_description)
#test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
## Creating a training and test datasets
set.seed(1234)
#robbery <- subset(robbery, select=-c(location_description))
#robbery_tr <- subset(robbery_tr, select=-c(location_description))
robbery_tr <- subset(robbery_tr, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4")
robbery <- subset(robbery, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4")

#robbery$location_description <- gsub("PARKING LOT","PARKING",robbery$location_description)
#robbery_tr$location_description <- gsub("PARKING LOT","PARKING",robbery_tr$location_description)
#robbery$location_description <- gsub("PARKINGGARAGE(NON.RESID.)","PARKING",robbery$location_description)
#robbery_tr$location_description <- gsub("PARKINGGARAGE(NON.RESID.)","PARKING",robbery_tr$location_description)

train_c50<- subset(robbery_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(robbery, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
train_c50$location_description <- factor(train_c50$location_description)
test_c50$location_description <- factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~., data=train_c50, control = C5.0Control(noGlobalPruning = TRUE, CF= 0.000001))  #Higher CF less prunning
summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
## Creating a training and test datasets
set.seed(1234)
theft <- subset(theft, select=-c(location_description))
theft_tr <- subset(theft_tr, select=-c(location_description))
theft_tr <- subset(theft_tr, district=="19" | district == "1" | district=="18"| district=="24" | district == "5" | district=="4")
theft <- subset(theft, district=="19" | district == "1" | district=="18"| district=="24" | district == "5" | district=="4")

train_c50<- subset(theft_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(theft, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
set.seed(1234)
#violent_crime <- subset(violent_crime, select=-c(location_description))
#violent_tr_crime <- subset(violent_tr_crime, select=-c(location_description))
violent_tr_crime <- subset(violent_tr_crime, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4" | district == "19"| district == "20"  | district == "10" | district=="21" | district == "25" | district == "2")
violent_crime <- subset(violent_crime, district=="6" | district == "11" | district == "15" | district=="8" | district == "1" | district=="18"| district=="9" | district == "5" | district=="4" | district == "19"| district == "20"  | district == "10" | district=="21" | district == "25" | district == "2")

train_c50<- subset(violent_tr_crime, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(violent_crime, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
#train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
#test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
train_c50$arrest <- as.factor(train_c50$arrest)
test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, trials = 10,control = C5.0Control(noGlobalPruning = FALSE, CF= 0.000001))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL,trial=9)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```

```{r}
set.seed(1234)
#weapons_violation <- subset(weapons_violation, select=-c(location_description))
#weapons_violation_tr <- subset(weapons_violation_tr, select=-c(location_description))
weapons_violation_tr <- subset(weapons_violation_tr, district=="6" | district == "11" | district == "7" | district=="8" | district == "1" | district=="18" | district == "5" | district=="4")
weapons_violation <- subset(weapons_violation, district=="6" | district == "11" | district == "7" | district=="8" | district == "1" | district=="18" | district == "5" | district=="4")

train_c50<- subset(weapons_violation_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude))
test_c50<- subset(weapons_violation, select=-c(case_number,block,ward,description,day,month,latitude,longitude))

#crime_split_c50<- initial_split(subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,description,day,month,latitude,longitude)), prop=0.8)
#train_c50<- training(crime_split_c50)
#test_c50<- testing(crime_split_c50)

train_c50$location_description <- as.factor(train_c50$location_description)
test_c50$location_description <- as.factor(test_c50$location_description)
#train_c50$arrest <- as.numeric(train_c50$arrest, unique(train_c50$arrest))
#test_c50$arrest <- as.numeric(test_c50$arrest, unique(test_c50$arrest))
#train_c50$primary_type <- as.numeric(train_c50$primary_type, unique(train_c50$primary_type))
#test_c50$primary_type <- as.numeric(test_c50$primary_type, unique(test_c50$primary_type))
train_c50$location_description <- as.numeric(train_c50$location_description, unique(train_c50$location_description))
test_c50$location_description <- as.numeric(test_c50$location_description, unique(test_c50$location_description))
#train_c50$district <- as.numeric(train_c50$district, unique(train_c50$district))
#test_c50$district <- as.numeric(test_c50$district, unique(test_c50$district))
#train_c50$arrest <- as.factor(train_c50$arrest)
#test_c50$arrest <- as.factor(test_c50$arrest)
train_c50$district <- as.factor(train_c50$district)
test_c50$district <- as.factor(test_c50$district)
train_c50$district <- factor(train_c50$district)
test_c50$district <- factor(test_c50$district)
#train_c50$primary_type <- as.factor(train_c50$primary_type)
#test_c50$primary_type <- as.factor(test_c50$primary_type)


#Creating the decision tree algorithm C4.5 
tree_result <- C5.0(district  ~ ., data=train_c50, control = C5.0Control(noGlobalPruning = FALSE, CF= 0.01))  #Higher CF less prunning
#summary(tree_result)
#Plotting the tree
plot(tree_result,subtree=NULL)

## PREDICTION
#Prediction of new cases from the test dataset
predictions <- predict(tree_result, newdata = test_c50, type ="class")

#table(prediction=predictions, real= crime_test_c50$primary_type)
#crime_test_c50$primary_type <- factor(crime_test_c50$primary_type)
error_classification <- mean(predictions != test_c50$district)

paste("The classification error in test set is:", 100*error_classification, "%",
      sum(predictions==test_c50$district),
      "correct classified cases from", length(predictions))

pred_train <- predict(tree_result, newdata = train_c50)
#confusionMatrix(pred_train, crime_train$district)
#pred_train<-round(pred_train)
#table(prediction=pred_train, real= crime_train_c50$arrest)

error_classification <- mean(pred_train != train_c50$district)

paste("The classification error in train set is:", 100*error_classification, "%",
      sum(pred_train==train_c50$district),
      "correct classified cases from", length(pred_train))
```
```{r}
library(MASS)       # for obtaining data
library(tidyverse)  # for data processing
library(rpart)      # for CART decision tree
library(rpart.plot) # for plotting CART
library(caret)      # for confusion matrix and more
library(rsample)    # for data splitting
library(randomForest)  # For bagging and randomforest
library(ggpubr)

## Solo se pueden tener 53 valores categoricos unicos diferentes. Por esta razon hay que hacer regresion con solo estas variables.
chicago_crime_ensemble <- subset(chicago_crime_subset, select=-c(case_number,block,description,location_description,ward,latitude,longitude))
chicago_crime_ensemble_tr <- subset(chicago_crime_subset_tr, select=-c(case_number,location_description,description,block,ward,latitude,longitude))

chicago_crime_ensemble$primary_type <- as.numeric(chicago_crime_ensemble$primary_type)
chicago_crime_ensemble$arrest <- as.factor(chicago_crime_ensemble$arrest)
chicago_crime_ensemble$arrest <- as.numeric(chicago_crime_ensemble$arrest)
#chicago_crime_ensemble$primary_type <- as.factor(chicago_crime_ensemble$primary_type)
#chicago_crime_ensemble$description <- as.factor(chicago_crime_ensemble$description)
#chicago_crime_ensemble$description <- factor(chicago_crime_ensemble$description)
#chicago_crime_ensemble$primary_type <- factor(chicago_crime_ensemble$primary_type)

chicago_crime_ensemble_tr$primary_type <- as.numeric(chicago_crime_ensemble_tr$primary_type)
chicago_crime_ensemble_tr$arrest <- as.factor(chicago_crime_ensemble_tr$arrest)
chicago_crime_ensemble_tr$arrest <- as.numeric(chicago_crime_ensemble_tr$arrest)
#chicago_crime_ensemble_tr$primary_type <- as.factor(chicago_crime_ensemble_tr$primary_type)
#chicago_crime_ensemble_tr$description <- as.factor(chicago_crime_ensemble_tr$description)
#chicago_crime_ensemble_tr$description <- factor(chicago_crime_ensemble_tr$description)
#chicago_crime_ensemble_tr$primary_type <- factor(chicago_crime_ensemble_tr$primary_type)

chicago_crime_ensemble_tr <- na.omit(chicago_crime_ensemble_tr)
chicago_crime_ensemble <- na.omit(chicago_crime_ensemble)

#RF_split<- initial_split(chicago_crime, prop=0.8)
RF_train<- chicago_crime_ensemble_tr[1:10000,]
RF_test<- chicago_crime_ensemble[1:1000,]

bagging_model<- randomForest(formula=primary_type  ~ ., data=RF_train,mtry=4)  #4 from 13 predictors will be selected

#Result of random forest model
print(bagging_model)

```
## REDES NEURONALES
```{r}
## REDES
chicago_crime_nn <- subset(chicago_crime_subset, select=-c(case_number,block,ward,day,latitude,longitude))
chicago_crime_nn_tr <- subset(chicago_crime_subset_tr, select=-c(case_number,block,ward,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- chicago_crime_nn_tr$primary_type
chicago_crime_nn_tr <- subset(chicago_crime_nn_tr, select=-c(primary_type))

chicago_crime_nn_tr$arrest <- as.factor(chicago_crime_nn_tr$arrest)
chicago_crime_nn_tr$arrest <- as.numeric(chicago_crime_nn_tr$arrest)
#chicago_crime$primary_type <- as.numeric(chicago_crime$primary_type)
chicago_crime_nn_tr$district <- as.numeric(chicago_crime_nn_tr$district)
#chicago_crime_nn_tr$district <- as.factor(chicago_crime_nn_tr$district)
chicago_crime_nn_tr$description <- as.factor(chicago_crime_nn_tr$description)
chicago_crime_nn_tr$description <- as.numeric(chicago_crime_nn_tr$description)
chicago_crime_nn_tr$location_description <- as.factor(chicago_crime_nn_tr$location_description)
chicago_crime_nn_tr$location_description <- as.numeric(chicago_crime_nn_tr$location_description)
chicago_crime_nn_tr$month <- as.numeric(chicago_crime_nn_tr$month)
#chicago_crime_nn_tr$month <- as.factor(chicago_crime_nn_tr$month)

#Scaling original data
set.seed(7)


# creation of training and test datasets
index <- sample(nrow(chicago_crime_nn_tr), round(0.75*nrow(chicago_crime_nn_tr)))
train <- chicago_crime_nn_tr[index,]
test <- chicago_crime_nn_tr[-index,]

train <- as.matrix(train)
test <- as.matrix(test)

train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train[1:100000,], grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

coolBlueHOtRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}
par(mfrow=c(2,3))
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,1],
     main=colnames(getCodes(crime.som, 1))[1],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,2],
     main=colnames(getCodes(crime.som, 1))[2],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,3],
     main=colnames(getCodes(crime.som, 1))[3],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,4],
     main=colnames(getCodes(crime.som, 1))[4],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,5],
     main=colnames(getCodes(crime.som, 1))[5],
     palette.name=coolBlueHOtRed)
#plot(crime.som, type="property", property=getCodes(crime.som, 1)[,6],
#     main=colnames(getCodes(crime.som, 1))[6],
#     palette.name=coolBlueHOtRed)

# Alternative easier
coolBlueHotRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}

par(mfrow=c(5,3))
for (j in 1:ncol(train)) {
plot(crime.som, type="property", property=crime.som$codes[[1]][,j],
     palette.name=coolBlueHotRed,
     main=colnames(train)[j], cex=0.5)
}

#som.prediction <- predict(crime.som, test)
```
```{r}
#Clustering patterns in the map
groups<-8
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = rainbow(groups)[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)
```
```{r}

#Scaling original data
set.seed(7)

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train[1:100000,], train_label[1:100000],
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")

#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))

plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))

# Plotting classes in neurons
plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)

#Prediction using the test data
print("TEST PREDICTION")
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
```
```{r}
## REDES
chicago_crime_nn2 <- subset(chicago_crime_subset, select=-c(description,case_number,block,day,latitude,longitude))
chicago_crime_nn_tr2 <- subset(chicago_crime_subset_tr, select=-c(description,case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train2 <- chicago_crime_nn_tr2$primary_type
chicago_crime_nn_tr2 <- subset(chicago_crime_nn_tr2, select=-c(primary_type))

target_test2 <- chicago_crime_nn2$primary_type
chicago_crime_nn2 <- subset(chicago_crime_nn2, select=-c(primary_type))

chicago_crime_nn_tr2$arrest <- as.factor(chicago_crime_nn_tr2$arrest)
chicago_crime_nn_tr2$arrest <- as.numeric(chicago_crime_nn_tr2$arrest)
#chicago_crime$primary_type <- as.numeric(chicago_crime$primary_type)
chicago_crime_nn_tr2$district <- as.numeric(chicago_crime_nn_tr2$district)
#chicago_crime_nn_tr$district <- as.factor(chicago_crime_nn_tr$district)
#chicago_crime_nn_tr2$description <- as.factor(chicago_crime_nn_tr2$description)
#chicago_crime_nn_tr2$description <- as.numeric(chicago_crime_nn_tr2$description)
chicago_crime_nn_tr2$location_description <- as.factor(chicago_crime_nn_tr2$location_description)
chicago_crime_nn_tr2$location_description <- as.numeric(chicago_crime_nn_tr2$location_description)
chicago_crime_nn_tr2$month <- as.numeric(chicago_crime_nn_tr2$month)
chicago_crime_nn_tr$month <- as.factor(chicago_crime_nn_tr$month)

#Scaling original data
set.seed(7)

# creation of training and test datasets
index <- sample(nrow(chicago_crime_nn_tr2), round(0.75*nrow(chicago_crime_nn_tr2)))
train <- chicago_crime_nn_tr2[index,]
test <- chicago_crime_nn_tr2[-index,]

train <- as.matrix(train)
test <- as.matrix(test)

train_label<-target_train2[index]
test_label<-target_train2[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train[1:100000,], grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

coolBlueHOtRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}
par(mfrow=c(2,3))
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,1],
     main=colnames(getCodes(crime.som, 1))[1],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,2],
     main=colnames(getCodes(crime.som, 1))[2],
     palette.name=coolBlueHOtRed)
plot(crime.som, type="property", property=getCodes(crime.som, 1)[,3],
     main=colnames(getCodes(crime.som, 1))[3],
     palette.name=coolBlueHOtRed)


# Alternative easier
coolBlueHotRed<-function(n, alpha=1){rainbow(n,end=4/6, 
                                             alpha=alpha)[n:1]}

```
```{r}
#Clustering patterns in the map
library(RColorBrewer)
groups<-8
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = brewer.pal(groups, name="YlGnBu")[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)
```
```{r}
#Scaling original data
set.seed(7)

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train[1:100000,], train_label[1:100000],
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")

#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))

plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))

# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)

#Prediction using the test data
print("TEST PREDICTION")
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
```
```{r}
## REDES
narcotics_nn <- subset(narcotics_tr, select=-c(case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- narcotics_nn$district
narcotics_nn <- subset(narcotics_nn, select=-c(district))

narcotics_nn$arrest <- as.factor(narcotics_nn$arrest)
narcotics_nn$arrest <- as.numeric(narcotics_nn$arrest)
narcotics_nn$primary_type <- as.factor(narcotics_nn$primary_type)
narcotics_nn$primary_type <- as.numeric(narcotics_nn$primary_type)
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
narcotics_nn$description <- as.factor(narcotics_nn$description)
narcotics_nn$description <- as.numeric(narcotics_nn$description)
narcotics_nn$location_description <- as.factor(narcotics_nn$location_description)
narcotics_nn$location_description <- as.numeric(narcotics_nn$location_description)
narcotics_nn$month <- as.numeric(narcotics_nn$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(narcotics_nn), round(0.75*nrow(narcotics_nn)))
train <- narcotics_nn[index,]
test <- narcotics_nn[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")

#training the map
crime.som <- som(train, grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

```
```{r}
#Scaling original data
set.seed(7)

#main characteristics of the map
som_grid<-somgrid(xdim=8, ydim=8, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")

#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))

plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))

#Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)

#Prediction using the test data
print("TEST PREDICTION")
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
```
```{r}
## REDES
crimen <- subset(weapons_violation_tr, select=-c(case_number,block,day,latitude,longitude))

library(kohonen) # for building the SOM map
library(caret)     #for confusion matrix

#Extracting target variable
target_train <- crimen$district
crimen <- subset(crimen, select=-c(district))

crimen$arrest <- as.factor(crimen$arrest)
crimen$arrest <- as.numeric(crimen$arrest)
crimen$primary_type <- as.factor(crimen$primary_type)
crimen$primary_type <- as.numeric(crimen$primary_type)
#narcotics_nn$district <- as.numeric(narcotics_nn$district)
#chicago_crime_nn$district <- as.factor(chicago_crime_nn$district)
crimen$description <- as.factor(crimen$description)
crimen$description <- as.numeric(crimen$description)
crimen$location_description <- as.factor(crimen$location_description)
crimen$location_description <- as.numeric(crimen$location_description)
crimen$month <- as.numeric(crimen$month)

#Scaling original data
set.seed(7)
#target_train.sc<-scale(chicago_crime_nn_tr)
#target_test.sc<-scale(chicago_crime_nn)

# creation of training and test datasets
index <- sample(nrow(crimen), round(0.75*nrow(crimen)))
train <- crimen[index,]
test <- crimen[-index,]

#train <- chicago_crime_nn_tr
train <- as.matrix(train)
#test <- chicago_crime_nn
test <- as.matrix(test)

#train_label<-target_train
#test_label<-target_test
train_label<-target_train[index]
test_label<-target_train[-index]

#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")

#training the map
crime.som <- som(train, grid=som_grid, 
               rlen=100, alpha=c(0.05, 0.01), 
               radius= 2, keep.data=T)

# Names of the variables used
colnames(train)

# main characteristics of the map
summary(crime.som)

#Showing the training process
plot(crime.som, type="changes")

#node counts
plot(crime.som, type="counts", main="Examples per Neuron")

#Codes/Weight vectors
plot(crime.som, type="codes", main="Patterns Discovered")

#Scaling original data
set.seed(7)

#main characteristics of the map
som_grid<-somgrid(xdim=12, ydim=12, topo="hexagonal")
set.seed(7)
kohmap <- xyf(train, train_label,
              grid=som_grid, 
              rlen=100, alpha=c(0.05, 0.01), 
              radius= 2, keep.data=T)

#Showing the training process
plot(kohmap, type="changes")

#Showing distribution of wine labels in neurons
plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c("Crime"))

plot(kohmap, type="codes",  codeRendering = "lines", shape="straight",
     main=c(" patterns"))

# Plotting classes in neurons
#plot(kohmap, type="mapping", labels=as.numeric(train_label)+3,
#     col=as.numeric(train_label)+3, pch=4, main="Map of classes", palette.name = terrain.colors)

#Prediction using the training data
print("TRAINING PREDICTION")
kohmap.predict_tr<- predict(kohmap, newdata=train, whatmap = 1)
prediction_table_tr<-table(train_label,kohmap.predict_tr$predictions[[2]])
confusionMatrix(prediction_table_tr)

#Prediction using the test data
print("TEST PREDICTION")
kohmap.predict<- predict(kohmap, newdata=test, whatmap = 1)
prediction_table<-table(test_label,kohmap.predict$predictions[[2]])
confusionMatrix(prediction_table)
```
```{r}
library(RColorBrewer)
groups<-23
#Applying hierarchical clustering for grouping patterns
crime.hc=cutree(hclust(dist(crime.som$codes[[1]])), groups)
plot(crime.som, type="codes", bgcol = heat.colors(groups)[crime.hc],
     main="clustering the patterns discovered")
add.cluster.boundaries(crime.som,crime.hc)
```



